diff --git a/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch b/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch index 8cb13cf..fc7456c 100644 --- a/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch +++ b/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch @@ -1,10 +1,11 @@ -From e2bc97d02026d17fad53c5b34ff4ca9aacf45080 Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Sat, 2 Jul 2022 02:47:15 +0200 -Subject: [PATCH] bcachefs-5.18: introduce bcachefs patchset +From 98a9e9a069c0619986de099d587dae0158d82eac Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 22 Jul 2022 14:22:01 +0200 +Subject: [PATCH] 5.18-bcachefs -Signed-off-by: Piotr Gorski +Signed-off-by: Peter Jung --- + .github/ISSUE_TEMPLATE/bug_report.md | 61 + Documentation/core-api/printk-formats.rst | 22 + arch/powerpc/kernel/process.c | 16 +- arch/powerpc/kernel/security.c | 75 +- @@ -25,18 +26,18 @@ Signed-off-by: Piotr Gorski drivers/pci/p2pdma.c | 21 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/bcachefs/Kconfig | 52 + - fs/bcachefs/Makefile | 68 + + fs/bcachefs/Kconfig | 59 + + fs/bcachefs/Makefile | 69 + fs/bcachefs/acl.c | 406 ++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 1600 ++++++++ - fs/bcachefs/alloc_background.h | 181 + - fs/bcachefs/alloc_foreground.c | 1282 ++++++ - fs/bcachefs/alloc_foreground.h | 173 + + fs/bcachefs/alloc_background.c | 1552 ++++++++ + fs/bcachefs/alloc_background.h | 183 + + fs/bcachefs/alloc_foreground.c | 1380 +++++++ + fs/bcachefs/alloc_foreground.h | 181 + fs/bcachefs/alloc_types.h | 87 + - fs/bcachefs/backpointers.c | 891 +++++ + fs/bcachefs/backpointers.c | 875 ++++ fs/bcachefs/backpointers.h | 38 + - fs/bcachefs/bcachefs.h | 988 +++++ + fs/bcachefs/bcachefs.h | 1000 +++++ fs/bcachefs/bcachefs_format.h | 2052 ++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ fs/bcachefs/bkey.c | 1175 ++++++ @@ -48,23 +49,23 @@ Signed-off-by: Piotr Gorski fs/bcachefs/bkey_sort.h | 44 + fs/bcachefs/bset.c | 1598 ++++++++ fs/bcachefs/bset.h | 615 +++ - fs/bcachefs/btree_cache.c | 1162 ++++++ + fs/bcachefs/btree_cache.c | 1170 ++++++ fs/bcachefs/btree_cache.h | 107 + - fs/bcachefs/btree_gc.c | 2128 ++++++++++ + fs/bcachefs/btree_gc.c | 2098 ++++++++++ fs/bcachefs/btree_gc.h | 112 + fs/bcachefs/btree_io.c | 2150 ++++++++++ fs/bcachefs/btree_io.h | 222 ++ - fs/bcachefs/btree_iter.c | 3471 ++++++++++++++++ - fs/bcachefs/btree_iter.h | 411 ++ - fs/bcachefs/btree_key_cache.c | 850 ++++ + fs/bcachefs/btree_iter.c | 3515 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 556 +++ + fs/bcachefs/btree_key_cache.c | 855 ++++ fs/bcachefs/btree_key_cache.h | 47 + - fs/bcachefs/btree_locking.h | 259 ++ - fs/bcachefs/btree_types.h | 687 ++++ - fs/bcachefs/btree_update.h | 156 + - fs/bcachefs/btree_update_interior.c | 2253 +++++++++++ + fs/bcachefs/btree_locking.h | 289 ++ + fs/bcachefs/btree_types.h | 697 ++++ + fs/bcachefs/btree_update.h | 158 + + fs/bcachefs/btree_update_interior.c | 2266 +++++++++++ fs/bcachefs/btree_update_interior.h | 321 ++ - fs/bcachefs/btree_update_leaf.c | 1815 +++++++++ - fs/bcachefs/buckets.c | 2114 ++++++++++ + fs/bcachefs/btree_update_leaf.c | 1800 +++++++++ + fs/bcachefs/buckets.c | 2113 ++++++++++ fs/bcachefs/buckets.h | 300 ++ fs/bcachefs/buckets_types.h | 103 + fs/bcachefs/buckets_waiting_for_journal.c | 167 + @@ -72,7 +73,7 @@ Signed-off-by: Piotr Gorski .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 760 ++++ fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 707 ++++ + fs/bcachefs/checksum.c | 712 ++++ fs/bcachefs/checksum.h | 204 + fs/bcachefs/clock.c | 
191 + fs/bcachefs/clock.h | 38 + @@ -82,20 +83,21 @@ Signed-off-by: Piotr Gorski fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 77 + - fs/bcachefs/data_update.c | 379 ++ + fs/bcachefs/data_update.c | 376 ++ fs/bcachefs/data_update.h | 38 + - fs/bcachefs/debug.c | 707 ++++ + fs/bcachefs/debug.c | 764 ++++ fs/bcachefs/debug.h | 30 + fs/bcachefs/dirent.c | 565 +++ fs/bcachefs/dirent.h | 67 + fs/bcachefs/disk_groups.c | 506 +++ fs/bcachefs/disk_groups.h | 90 + - fs/bcachefs/ec.c | 1695 ++++++++ + fs/bcachefs/ec.c | 1673 ++++++++ fs/bcachefs/ec.h | 230 ++ fs/bcachefs/ec_types.h | 46 + - fs/bcachefs/errcode.h | 12 + - fs/bcachefs/error.c | 185 + - fs/bcachefs/error.h | 238 ++ + fs/bcachefs/errcode.c | 51 + + fs/bcachefs/errcode.h | 64 + + fs/bcachefs/error.c | 184 + + fs/bcachefs/error.h | 223 ++ fs/bcachefs/extent_update.c | 178 + fs/bcachefs/extent_update.h | 12 + fs/bcachefs/extents.c | 1324 +++++++ @@ -105,24 +107,24 @@ Signed-off-by: Piotr Gorski fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 496 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3496 +++++++++++++++++ + fs/bcachefs/fs-io.c | 3496 ++++++++++++++++ fs/bcachefs/fs-io.h | 56 + fs/bcachefs/fs-ioctl.c | 523 +++ fs/bcachefs/fs-ioctl.h | 81 + fs/bcachefs/fs.c | 1939 +++++++++ fs/bcachefs/fs.h | 208 + - fs/bcachefs/fsck.c | 2413 ++++++++++++ + fs/bcachefs/fsck.c | 2390 +++++++++++ fs/bcachefs/fsck.h | 8 + fs/bcachefs/inode.c | 771 ++++ fs/bcachefs/inode.h | 189 + - fs/bcachefs/io.c | 2417 ++++++++++++ + fs/bcachefs/io.c | 2422 ++++++++++++ fs/bcachefs/io.h | 189 + fs/bcachefs/io_types.h | 161 + fs/bcachefs/journal.c | 1429 +++++++ fs/bcachefs/journal.h | 521 +++ fs/bcachefs/journal_io.c | 1735 ++++++++ fs/bcachefs/journal_io.h | 59 + - fs/bcachefs/journal_reclaim.c | 849 ++++ + fs/bcachefs/journal_reclaim.c | 852 ++++ fs/bcachefs/journal_reclaim.h | 86 + fs/bcachefs/journal_sb.c | 220 ++ fs/bcachefs/journal_sb.h | 24 + @@ -132,26 +134,26 @@ Signed-off-by: Piotr Gorski fs/bcachefs/keylist.c | 67 + fs/bcachefs/keylist.h | 76 + fs/bcachefs/keylist_types.h | 16 + - fs/bcachefs/lru.c | 219 ++ + fs/bcachefs/lru.c | 206 + fs/bcachefs/lru.h | 19 + - fs/bcachefs/migrate.c | 193 + + fs/bcachefs/migrate.c | 186 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 951 +++++ + fs/bcachefs/move.c | 952 +++++ fs/bcachefs/move.h | 67 + fs/bcachefs/move_types.h | 19 + - fs/bcachefs/movinggc.c | 282 ++ - fs/bcachefs/movinggc.h | 9 + + fs/bcachefs/movinggc.c | 285 ++ + fs/bcachefs/movinggc.h | 10 + fs/bcachefs/opts.c | 578 +++ - fs/bcachefs/opts.h | 504 +++ - fs/bcachefs/quota.c | 859 ++++ + fs/bcachefs/opts.h | 509 +++ + fs/bcachefs/quota.c | 823 ++++ fs/bcachefs/quota.h | 71 + fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 358 ++ + fs/bcachefs/rebalance.c | 361 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1584 ++++++++ + fs/bcachefs/recovery.c | 1597 ++++++++ fs/bcachefs/recovery.h | 58 + - fs/bcachefs/reflink.c | 421 ++ + fs/bcachefs/reflink.c | 422 ++ fs/bcachefs/reflink.h | 76 + fs/bcachefs/replicas.c | 1073 +++++ fs/bcachefs/replicas.h | 106 + @@ -159,20 +161,20 @@ Signed-off-by: Piotr Gorski fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + fs/bcachefs/str_hash.h | 351 ++ - fs/bcachefs/subvolume.c | 1095 ++++++ - fs/bcachefs/subvolume.h | 126 + + fs/bcachefs/subvolume.c | 1108 ++++++ + fs/bcachefs/subvolume.h | 137 + fs/bcachefs/subvolume_types.h | 9 + fs/bcachefs/super-io.c | 1602 ++++++++ fs/bcachefs/super-io.h | 126 
+ - fs/bcachefs/super.c | 1970 ++++++++++ + fs/bcachefs/super.c | 1950 +++++++++ fs/bcachefs/super.h | 264 ++ fs/bcachefs/super_types.h | 51 + fs/bcachefs/sysfs.c | 943 +++++ fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 947 +++++ + fs/bcachefs/tests.c | 976 +++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 12 + - fs/bcachefs/util.c | 958 +++++ + fs/bcachefs/util.c | 964 +++++ fs/bcachefs/util.h | 783 ++++ fs/bcachefs/varint.c | 121 + fs/bcachefs/varint.h | 11 + @@ -204,13 +206,16 @@ Signed-off-by: Piotr Gorski include/linux/trace_events.h | 2 +- include/linux/trace_seq.h | 17 +- include/linux/vmalloc.h | 1 + - include/trace/events/bcachefs.h | 1020 +++++ + include/net/9p/9p.h | 2 +- + include/net/9p/client.h | 20 +- + include/trace/events/bcachefs.h | 1048 +++++ init/init_task.c | 1 + kernel/Kconfig.locks | 3 + kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 20 + kernel/locking/six.c | 759 ++++ kernel/module.c | 4 +- + kernel/stacktrace.c | 2 + kernel/trace/trace.c | 45 +- kernel/trace/trace_dynevent.c | 34 +- kernel/trace/trace_events_filter.c | 2 +- @@ -222,6 +227,7 @@ Signed-off-by: Piotr Gorski lib/Kconfig.debug | 9 + lib/Makefile | 8 +- {drivers/md/bcache => lib}/closure.c | 35 +- + lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/hexdump.c | 246 +- lib/pretty-printers.c | 60 + @@ -241,8 +247,14 @@ Signed-off-by: Piotr Gorski mm/slab_common.c | 53 +- mm/vmalloc.c | 21 + mm/vmscan.c | 88 + + net/9p/client.c | 97 +- + net/9p/trans_fd.c | 12 +- + net/9p/trans_rdma.c | 4 +- + net/9p/trans_virtio.c | 4 +- + net/9p/trans_xen.c | 2 +- tools/testing/nvdimm/test/ndtest.c | 22 +- - 237 files changed, 83816 insertions(+), 2162 deletions(-) + 248 files changed, 84382 insertions(+), 2223 deletions(-) + create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c @@ -311,6 +323,7 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/ec.c create mode 100644 fs/bcachefs/ec.h create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/errcode.c create mode 100644 fs/bcachefs/errcode.h create mode 100644 fs/bcachefs/error.c create mode 100644 fs/bcachefs/error.h @@ -410,8 +423,75 @@ Signed-off-by: Piotr Gorski delete mode 100644 lib/seq_buf.c rename {lib => mm}/show_mem.c (83%) +diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md +new file mode 100644 +index 000000000000..8af34357dd98 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/bug_report.md +@@ -0,0 +1,61 @@ ++--- ++name: Bug report ++about: Create a report to help us improve ++title: " [short commit id]" ++labels: bug ++assignees: YellowOnion ++ ++--- ++ ++**Please search for duplicates** ++ ++**Version** ++ ++Make sure you're using a reasonably new version. ++ ++Provide the commit hash from the kernel version (preferable) or tools, don't say "I'm using the latest master" as that will very quickly become out of date. ++ ++**Generic info** ++Provide the output of: ++``` ++bcachefs fs usage ++bcachefs show-super ++``` ++**Tools bugs** ++ ++* pull the latest version, compile it, do not strip the binary. ++* provide the exact commands you used to run. ++* run with gdb: `gdb -ex run --args ./bcacehfs ` ++ ++If you get an assert/segfault etc: ++* type `bt` in to and provide the output here. ++ ++If the tools lockup: ++* run `perf top -p $(pidof bcachefs)` and provide a screenshot. ++* press ctrl+c to interrupt the process and provide the output of `bt`. 
++ ++**Kernel bugs** ++Compile the kernel with these flags: ++ ++``` ++CONFIG_PREEMPT=y ++CONFIG_BCACHEFS_DEBUG=y ++CONFIG_KALLSYMS=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_DEBUG_FS=y ++CONFIG_DYNAMIC_FTRACE=y ++CONFIG_FTRACE=y ++``` ++Provide the output of `dmesg` either in a paste-bin or as attachment, if less than 30~ lines just provide inline here. ++ ++ ++**Optional Advanced** ++ ++If lockup or performance issues: ++* run `perf record` and `perf record -e 'bcachefs:*' -o events.data` both during the window of issue and then ctrl+c. ++* run `perf archive` to dump symbols. ++* archive, compress and upload the files: `perf.data`, `events.data` and `perf.data.tar.bz2`. ++ ++Upload large files to a file storage provider: ++* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` ++*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst -index 5e89497ba..4f4a35b3a 100644 +index 5e89497ba314..4f4a35b3aadc 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -625,6 +625,28 @@ Examples:: @@ -444,7 +524,7 @@ index 5e89497ba..4f4a35b3a 100644 ====== diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c -index 9be279469..4212864c8 100644 +index 9be279469a85..4212864c81d5 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -39,7 +39,7 @@ @@ -496,7 +576,7 @@ index 9be279469..4212864c8 100644 } diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c -index d96fd14bd..b34de62e6 100644 +index d96fd14bd7c9..b34de62e65ce 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -10,7 +10,7 @@ @@ -645,7 +725,7 @@ index d96fd14bd..b34de62e6 100644 #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c -index 82cae0897..fe2b41858 100644 +index 82cae08976bc..fe2b41858b5f 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -12,7 +12,7 @@ @@ -738,7 +818,7 @@ index 82cae0897..fe2b41858 100644 DEVICE_ATTR_RO(flags); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c -index 83f901e2c..5b6720b6a 100644 +index 83f901e2c2df..5b6720b6a417 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -19,7 +19,7 @@ @@ -805,7 +885,7 @@ index 83f901e2c..5b6720b6a 100644 ret = rdtgroup_setup_root(); if (ret) diff --git a/block/bio.c b/block/bio.c -index d3ca79c3e..8779a80f8 100644 +index d3ca79c3ebdf..8779a80f8156 100644 --- a/block/bio.c +++ b/block/bio.c @@ -553,15 +553,15 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) @@ -880,7 +960,7 @@ index d3ca79c3e..8779a80f8 100644 static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c -index a7329475a..a0929889c 100644 +index a7329475aba2..a0929889cf27 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -207,6 +207,7 @@ const char *blk_status_to_str(blk_status_t status) @@ -892,7 +972,7 @@ index a7329475a..a0929889c 100644 /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h -index 8ccbc6e07..16067c4ac 100644 +index 8ccbc6e07636..16067c4ac775 100644 --- a/block/blk.h +++ b/block/blk.h @@ -240,7 +240,6 @@ static inline void blk_integrity_del(struct 
gendisk *disk) @@ -904,7 +984,7 @@ index 8ccbc6e07..16067c4ac 100644 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c -index c740f0faa..90aa034dc 100644 +index c740f0faad39..90aa034dceb0 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -11,6 +11,7 @@ @@ -916,7 +996,7 @@ index c740f0faa..90aa034dc 100644 #include #include diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 4e1dce3be..0e822f3ef 100644 +index 4e1dce3beab0..0e822f3ef912 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1153,8 +1153,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) @@ -929,7 +1009,7 @@ index 4e1dce3be..0e822f3ef 100644 int err; diff --git a/drivers/clk/tegra/clk-bpmp.c b/drivers/clk/tegra/clk-bpmp.c -index 6ecf18f71..301551174 100644 +index 6ecf18f71c32..301551174c13 100644 --- a/drivers/clk/tegra/clk-bpmp.c +++ b/drivers/clk/tegra/clk-bpmp.c @@ -5,7 +5,7 @@ @@ -991,7 +1071,7 @@ index 6ecf18f71..301551174 100644 static int tegra_bpmp_probe_clocks(struct tegra_bpmp *bpmp, diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c -index 3088c5b82..a8c5f90e8 100644 +index 3088c5b829f0..a8c5f90e8208 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -19,7 +19,7 @@ @@ -1038,7 +1118,7 @@ index 3088c5b82..a8c5f90e8 100644 /* diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig -index cf3e80969..f1a1f0c4a 100644 +index cf3e8096942a..f1a1f0c4a0ea 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -4,6 +4,7 @@ config BCACHE @@ -1066,7 +1146,7 @@ index cf3e80969..f1a1f0c4a 100644 bool "Asynchronous device registration (EXPERIMENTAL)" depends on BCACHE diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile -index 5b87e5967..054e8a33a 100644 +index 5b87e59676b8..054e8a33a7ab 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -2,6 +2,6 @@ @@ -1079,7 +1159,7 @@ index 5b87e5967..054e8a33a 100644 + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 9ed9c955a..dbb72beb0 100644 +index 9ed9c955add7..dbb72beb036c 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ @@ -1099,7 +1179,7 @@ index 9ed9c955a..dbb72beb0 100644 struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 2bb55278d..4a517301d 100644 +index 2bb55278d22d..4a517301db08 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2914,7 +2914,6 @@ static int __init bcache_init(void) @@ -1111,7 +1191,7 @@ index 2bb55278d..4a517301d 100644 bcache_is_reboot = false; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 6f3cb7c92..f61ab1bad 100644 +index 6f3cb7c92130..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,6 +4,7 @@ @@ -1132,7 +1212,7 @@ index 6f3cb7c92..f61ab1bad 100644 #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c -index 30b1df3c9..3b7a6ca44 100644 +index 30b1df3c9d2f..3b7a6ca44668 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -17,7 +17,7 @@ @@ -1207,7 +1287,7 @@ index 30b1df3c9..3b7a6ca44 100644 acs_redirects = true; diff --git a/fs/Kconfig b/fs/Kconfig -index 30b751c7f..1160311af 100644 +index 
30b751c7f11a..1160311af303 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" @@ -1219,7 +1299,7 @@ index 30b751c7f..1160311af 100644 endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile -index 208a74e0b..5d5c8c792 100644 +index 208a74e0b00e..5d5c8c792058 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -134,6 +134,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ @@ -1232,10 +1312,10 @@ index 208a74e0b..5d5c8c792 100644 obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000..27742ce27 +index 000000000000..008886967841 --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,52 @@ +@@ -0,0 +1,59 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support" @@ -1260,6 +1340,7 @@ index 000000000..27742ce27 + select XOR_BLOCKS + select XXHASH + select SRCU ++ select SYMBOLIC_ERRNAME + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. @@ -1288,12 +1369,18 @@ index 000000000..27742ce27 + depends on BCACHEFS_FS + help + Include some unit and performance tests for the core btree code ++ ++config BCACHEFS_LOCK_TIME_STATS ++ bool "bcachefs lock time statistics" ++ depends on BCACHEFS_FS ++ help ++ Expose statistics for how long we held a lock in debugfs diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000..d68aaf1a2 +index 000000000000..5dad8ed03a20 --- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,68 @@ +@@ -0,0 +1,69 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -1324,6 +1411,7 @@ index 000000000..d68aaf1a2 + disk_groups.o \ + data_update.o \ + ec.o \ ++ errcode.o \ + error.o \ + extents.o \ + extent_update.o \ @@ -1364,7 +1452,7 @@ index 000000000..d68aaf1a2 +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 -index 000000000..5070caf8f +index 000000000000..5c6ccf685094 --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,406 @@ @@ -1606,7 +1694,7 @@ index 000000000..5070caf8f + &X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); + if (ret) { -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (ret != -ENOENT) + acl = ERR_PTR(ret); @@ -1705,7 +1793,7 @@ index 000000000..5070caf8f +btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err; @@ -1776,7 +1864,7 @@ index 000000000..5070caf8f +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h new file mode 100644 -index 000000000..2d76a4897 +index 000000000000..2d76a4897ba8 --- /dev/null +++ b/fs/bcachefs/acl.h @@ -0,0 +1,58 @@ @@ -1840,10 +1928,10 @@ index 000000000..2d76a4897 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000..738567173 +index 000000000000..cd6cbd2064ee --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1600 @@ +@@ -0,0 +1,1552 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -2389,7 +2477,7 @@ index 000000000..738567173 + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error reading alloc info: %i", ret); ++ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + + return ret; +} @@ -2577,12 +2665,13 @@ index 000000000..738567173 + if (ret) + goto err; + -+ if 
(fsck_err_on(k.k->type != discard_key_type, c, -+ "incorrect key in need_discard btree (got %s should be %s)\n" -+ " %s", -+ bch2_bkey_types[k.k->type], -+ bch2_bkey_types[discard_key_type], -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ if (k.k->type != discard_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + @@ -2604,13 +2693,14 @@ index 000000000..738567173 + if (ret) + goto err; + -+ if (fsck_err_on(k.k->type != freespace_key_type, c, -+ "incorrect key in freespace btree (got %s should be %s)\n" -+ " %s", -+ bch2_bkey_types[k.k->type], -+ bch2_bkey_types[freespace_key_type], -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ if (k.k->type != freespace_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + @@ -2638,7 +2728,7 @@ index 000000000..738567173 +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; -+ struct bkey_s_c k, freespace_k; ++ struct bkey_s_c alloc_k; + struct bch_alloc_v4 a; + u64 genbits; + struct bpos pos; @@ -2648,14 +2738,6 @@ index 000000000..738567173 + struct printbuf buf = PRINTBUF; + int ret; + -+ freespace_k = bch2_btree_iter_peek(iter); -+ if (!freespace_k.k) -+ return 1; -+ -+ ret = bkey_err(freespace_k); -+ if (ret) -+ return ret; -+ + pos = iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = iter->pos.offset & (~0ULL << 56); @@ -2667,18 +2749,18 @@ index 000000000..738567173 + bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) + goto delete; + -+ k = bch2_btree_iter_peek_slot(&alloc_iter); -+ ret = bkey_err(k); ++ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); + if (ret) + goto err; + -+ bch2_alloc_to_v4(k, &a); ++ bch2_alloc_to_v4(alloc_k, &a); + + if (fsck_err_on(a.data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(a)), c, + "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_ids[iter->btree_id], + a.data_type == state, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) @@ -2699,6 +2781,7 @@ index 000000000..738567173 +{ + struct btree_trans trans; + struct btree_iter iter, discard_iter, freespace_iter; ++ struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); @@ -2710,7 +2793,7 @@ index 000000000..738567173 + bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_check_alloc_key(&trans, &iter, @@ -2728,36 +2811,16 @@ index 000000000..738567173 + if (ret < 0) + goto err; + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ while (1) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ 
bch2_check_discard_freespace_key(&trans, &iter)); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_advance(&iter); -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret < 0) -+ goto err; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ while (1) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_check_discard_freespace_key(&trans, &iter)); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_advance(&iter); -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)) ?: ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)); +err: + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; @@ -2851,32 +2914,53 @@ index 000000000..738567173 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_check_alloc_to_lru_ref(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_alloc_to_lru_ref(&trans, &iter)); + + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; +} + -+static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, -+ struct bch_dev *ca, bool *discard_done) ++static int bch2_discard_one_bucket(struct btree_trans *trans, ++ struct btree_iter *need_discard_iter, ++ struct bpos *discard_pos_done, ++ u64 *seen, ++ u64 *open, ++ u64 *need_journal_commit, ++ u64 *discarded) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; ++ struct bpos pos = need_discard_iter->pos; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c k; ++ struct bch_dev *ca; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; -+ int ret; ++ bool did_discard = false; ++ int ret = 0; + -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); ++ return 0; ++ } ++ ++ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { ++ (*open)++; ++ goto out; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ pos.inode, pos.offset)) { ++ (*need_journal_commit)++; ++ goto out; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ need_discard_iter->pos, + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); @@ -2912,7 +2996,8 @@ index 000000000..738567173 + goto out; + } + -+ if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { ++ if (bkey_cmp(*discard_pos_done, iter.pos) && ++ ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree @@ -2922,19 +3007,31 @@ index 000000000..738567173 + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL, 0); -+ *discard_done = true; + -+ ret = 
bch2_trans_relock(trans) ? 0 : -EINTR; ++ ret = bch2_trans_relock(trans); + if (ret) + goto out; + } + ++ *discard_pos_done = iter.pos; ++ did_discard = true; ++ + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); +write: -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); ++ if (ret) ++ goto out; ++ ++ if (did_discard) { ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); ++ (*discarded)++; ++ } +out: + bch2_trans_iter_exit(trans, &iter); ++ percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} @@ -2942,61 +3039,27 @@ index 000000000..738567173 +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); -+ struct bch_dev *ca = NULL; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; ++ struct bpos discard_pos_done = POS_MAX; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_need_discard, -+ POS_MIN, 0, k, ret) { -+ bool discard_done = false; -+ -+ if (ca && k.k->p.inode != ca->dev_idx) { -+ percpu_ref_put(&ca->io_ref); -+ ca = NULL; -+ } -+ -+ if (!ca) { -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ ca = NULL; -+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); -+ continue; -+ } -+ } -+ -+ seen++; -+ -+ if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { -+ open++; -+ continue; -+ } -+ -+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -+ c->journal.flushed_seq_ondisk, -+ k.k->p.inode, k.k->p.offset)) { -+ need_journal_commit++; -+ continue; -+ } -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOFAIL, -+ bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); -+ if (ret) -+ break; -+ -+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); -+ discarded++; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ca) -+ percpu_ref_put(&ca->io_ref); ++ /* ++ * We're doing the commit in bch2_discard_one_bucket instead of using ++ * for_each_btree_key_commit() so that we can increment counters after ++ * successful commit: ++ */ ++ ret = for_each_btree_key2(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, 0, k, ++ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, ++ &seen, ++ &open, ++ &need_journal_commit, ++ &discarded)); + + bch2_trans_exit(&trans); + @@ -3005,7 +3068,8 @@ index 000000000..738567173 + + percpu_ref_put(&c->writes); + -+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ret); ++ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ++ bch2_err_str(ret)); +} + +void bch2_do_discards(struct bch_fs *c) @@ -3015,29 +3079,20 @@ index 000000000..738567173 + percpu_ref_put(&c->writes); +} + -+static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca, -+ struct bpos *bucket_pos, unsigned *cached_sectors) ++static int invalidate_one_bucket(struct btree_trans *trans, ++ struct btree_iter *lru_iter, struct bkey_s_c k, ++ unsigned dev_idx, s64 *nr_to_invalidate) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter lru_iter, alloc_iter = { NULL }; -+ struct bkey_s_c k; ++ struct btree_iter alloc_iter = { NULL }; + struct bkey_i_alloc_v4 *a; -+ u64 bucket, idx; ++ 
struct bpos bucket; + struct printbuf buf = PRINTBUF; -+ int ret; ++ unsigned cached_sectors; ++ int ret = 0; + -+ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, -+ POS(ca->dev_idx, 0), 0); -+next_lru: -+ k = bch2_btree_iter_peek(&lru_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto out; -+ -+ if (!k.k || k.k->p.inode != ca->dev_idx) { -+ ret = 1; -+ goto out; -+ } ++ if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) ++ return 1; + + if (k.k->type != KEY_TYPE_lru) { + prt_printf(&buf, "non lru key in lru btree:\n "); @@ -3045,26 +3100,22 @@ index 000000000..738567173 + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); -+ bch2_btree_iter_advance(&lru_iter); -+ goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; -+ goto out; + } ++ ++ goto out; + } + -+ idx = k.k->p.offset; -+ bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); + -+ *bucket_pos = POS(ca->dev_idx, bucket); -+ -+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, *bucket_pos); ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + -+ if (idx != alloc_lru_idx(a->v)) { ++ if (k.k->p.offset != alloc_lru_idx(a->v)) { + prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_printf(&buf, "\n "); @@ -3072,19 +3123,18 @@ index 000000000..738567173 + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); -+ bch2_btree_iter_advance(&lru_iter); -+ goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; -+ goto out; + } ++ ++ goto out; + } + + if (!a->v.cached_sectors) + bch_err(c, "invalidating empty bucket, confused"); + -+ *cached_sectors = a->v.cached_sectors; ++ cached_sectors = a->v.cached_sectors; + + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; @@ -3094,13 +3144,18 @@ index 000000000..738567173 + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + -+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + if (ret) + goto out; ++ ++ trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); ++ --*nr_to_invalidate; +out: + bch2_trans_iter_exit(trans, &alloc_iter); -+ bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} @@ -3110,8 +3165,9 @@ index 000000000..738567173 + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans trans; -+ struct bpos bucket; -+ unsigned i, sectors; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); @@ -3120,17 +3176,13 @@ index 000000000..738567173 + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + -+ while (nr_to_invalidate-- >= 0) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOFAIL, -+ invalidate_one_bucket(&trans, ca, &bucket, -+ §ors)); -+ if (ret) -+ break; ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, ++ 
POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, ++ invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); + -+ trace_invalidate_bucket(c, bucket.inode, bucket.offset, sectors); -+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); ++ if (ret < 0) { ++ percpu_ref_put(&ca->ref); ++ break; + } + } + @@ -3145,16 +3197,13 @@ index 000000000..738567173 + percpu_ref_put(&c->writes); +} + -+static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter) ++static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, struct bch_dev *ca) +{ + struct bch_alloc_v4 a; -+ struct bkey_s_c k; -+ int ret; + -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; ++ if (iter->pos.offset >= ca->mi.nbuckets) ++ return 1; + + bch2_alloc_to_v4(k, &a); + return bch2_bucket_do_index(trans, k, &a, true); @@ -3170,25 +3219,16 @@ index 000000000..738567173 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, ca->mi.first_bucket), -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (iter.pos.offset >= ca->mi.nbuckets) -+ break; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bucket_freespace_init(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bucket_freespace_init(&trans, &iter, k, ca)); + + bch2_trans_exit(&trans); + -+ if (ret) { -+ bch_err(ca, "error initializing free space: %i", ret); ++ if (ret < 0) { ++ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + return ret; + } + @@ -3197,7 +3237,7 @@ index 000000000..738567173 + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); + -+ return ret; ++ return 0; +} + +int bch2_fs_freespace_init(struct bch_fs *c) @@ -3446,10 +3486,10 @@ index 000000000..738567173 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000..2ac6b5046 +index 000000000000..044bc72992d4 --- /dev/null +++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,181 @@ +@@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H @@ -3602,11 +3642,13 @@ index 000000000..2ac6b5046 +static inline u64 should_invalidate_buckets(struct bch_dev *ca, + struct bch_dev_usage u) +{ -+ u64 free = u.d[BCH_DATA_free].buckets + -+ u.d[BCH_DATA_need_discard].buckets; ++ u64 want_free = ca->mi.nbuckets >> 7; ++ u64 free = max_t(s64, 0, ++ u.d[BCH_DATA_free].buckets ++ + u.d[BCH_DATA_need_discard].buckets ++ - bch2_dev_buckets_reserved(ca, RESERVE_none)); + -+ return clamp_t(s64, (ca->mi.nbuckets >> 7) - free, -+ 0, u.d[BCH_DATA_cached].buckets); ++ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); +} + +void bch2_do_invalidates(struct bch_fs *); @@ -3633,10 +3675,10 @@ index 000000000..2ac6b5046 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000..7a878a690 +index 000000000000..6e52230e69e1 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1282 @@ +@@ -0,0 +1,1380 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. 
@@ -3665,6 +3707,7 @@ index 000000000..7a878a690 +#include "error.h" +#include "io.h" +#include "journal.h" ++#include "movinggc.h" + +#include +#include @@ -3865,7 +3908,7 @@ index 000000000..7a878a690 + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); -+ return ERR_PTR(-OPEN_BUCKETS_EMPTY); ++ return ERR_PTR(-BCH_ERR_open_buckets_empty); + } + + /* Recheck under lock: */ @@ -3977,7 +4020,10 @@ index 000000000..7a878a690 + skipped_need_journal_commit, + skipped_nouse, + cl); ++ if (!ob) ++ iter.path->preserve = false; +err: ++ set_btree_iter_dontneed(&iter); + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; @@ -4016,15 +4062,15 @@ index 000000000..7a878a690 + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * -+bch2_bucket_alloc_trans_early(struct btree_trans *trans, -+ struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ u64 *cur_bucket, -+ u64 *buckets_seen, -+ u64 *skipped_open, -+ u64 *skipped_need_journal_commit, -+ u64 *skipped_nouse, -+ struct closure *cl) ++bch2_bucket_alloc_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -4034,7 +4080,7 @@ index 000000000..7a878a690 + *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); + *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); + -+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a; + @@ -4064,10 +4110,10 @@ index 000000000..7a878a690 + + *cur_bucket = iter.pos.offset; + -+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++ return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); +} + -+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *cur_bucket, @@ -4082,29 +4128,24 @@ index 000000000..7a878a690 + struct open_bucket *ob = NULL; + int ret; + -+ if (unlikely(!ca->mi.freespace_initialized)) -+ return bch2_bucket_alloc_trans_early(trans, ca, reserve, -+ cur_bucket, -+ buckets_seen, -+ skipped_open, -+ skipped_need_journal_commit, -+ skipped_nouse, -+ cl); -+ + BUG_ON(ca->new_fs_bucket_idx); + ++ /* ++ * XXX: ++ * On transaction restart, we'd like to restart from the bucket we were ++ * at previously ++ */ + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); -+ *cur_bucket < k.k->p.offset && !ob; ++ *cur_bucket < k.k->p.offset; + (*cur_bucket)++) { -+ if (btree_trans_too_many_iters(trans)) { -+ ob = ERR_PTR(-EINTR); ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) + break; -+ } + + (*buckets_seen)++; + @@ -4114,8 +4155,11 @@ index 000000000..7a878a690 + skipped_need_journal_commit, + skipped_nouse, + k, cl); ++ if (ob) ++ break; + } -+ if (ob) ++ ++ if (ob || ret) + break; + } + bch2_trans_iter_exit(trans, &iter); @@ -4128,15 +4172,19 @@ index 000000000..7a878a690 + * + * Returns index of bucket on success, 0 on failure + * */ -+struct open_bucket 
*bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ ++ struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; + struct bch_dev_usage usage; ++ bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); ++ u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor; + u64 avail; -+ u64 cur_bucket = 0; ++ u64 cur_bucket = start; + u64 buckets_seen = 0; + u64 skipped_open = 0; + u64 skipped_need_journal_commit = 0; @@ -4145,7 +4193,7 @@ index 000000000..7a878a690 + int ret; +again: + usage = bch2_dev_usage_read(ca); -+ avail = dev_buckets_free(ca, usage,reserve); ++ avail = dev_buckets_free(ca, usage, reserve); + + if (usage.d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); @@ -4166,7 +4214,7 @@ index 000000000..7a878a690 + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + -+ ob = ERR_PTR(-FREELIST_EMPTY); ++ ob = ERR_PTR(-BCH_ERR_freelist_empty); + goto err; + } + @@ -4179,34 +4227,67 @@ index 000000000..7a878a690 + return ob; + } + -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, -+ &cur_bucket, -+ &buckets_seen, -+ &skipped_open, -+ &skipped_need_journal_commit, -+ &skipped_nouse, -+ cl))); ++ ob = likely(ca->mi.freespace_initialized) ++ ? bch2_bucket_alloc_freelist(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl) ++ : bch2_bucket_alloc_early(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl); + + if (skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); ++ ++ if (!ob && !ret && !freespace_initialized && start) { ++ start = cur_bucket = 0; ++ goto again; ++ } ++ ++ if (!freespace_initialized) ++ ca->bucket_alloc_trans_early_cursor = cur_bucket; +err: + if (!ob) -+ ob = ERR_PTR(ret ?: -FREELIST_EMPTY); ++ ob = ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); + + if (IS_ERR(ob)) { -+ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, ++ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], ++ usage.d[BCH_DATA_free].buckets, ++ avail, ++ bch2_copygc_wait_amount(c), ++ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, -+ cl == NULL, PTR_ERR(ob)); ++ cl == NULL, ++ bch2_err_str(PTR_ERR(ob))); + atomic_long_inc(&c->bucket_alloc_fail); + } + + return ob; +} + ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ may_alloc_partial, cl))); ++ return ob; ++} ++ +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) +{ @@ -4272,7 +4353,7 @@ index 000000000..7a878a690 + ob_push(c, ptrs, ob); +} + -+int bch2_bucket_alloc_set(struct bch_fs *c, ++static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, @@ -4283,11 +4364,12 @@ index 000000000..7a878a690 + unsigned flags, + struct closure *cl) +{ ++ struct bch_fs *c = 
trans->c; + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; + struct bch_dev *ca; -+ int ret = -INSUFFICIENT_DEVICES; ++ int ret = -BCH_ERR_insufficient_devices; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); @@ -4311,7 +4393,7 @@ index 000000000..7a878a690 + continue; + } + -+ ob = bch2_bucket_alloc(c, ca, reserve, ++ ob = bch2_bucket_alloc_trans(trans, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment(ca, stripe); @@ -4319,8 +4401,7 @@ index 000000000..7a878a690 + + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); -+ -+ if (cl) ++ if (ret == -EINTR || cl) + break; + continue; + } @@ -4337,6 +4418,24 @@ index 000000000..7a878a690 + return ret; +} + ++int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, ++ devs_may_alloc, nr_replicas, ++ nr_effective, have_cache, reserve, ++ flags, cl)); ++} ++ +/* Allocate from stripes: */ + +/* @@ -4441,7 +4540,7 @@ index 000000000..7a878a690 + wp->ptrs = ptrs_skip; +} + -+static int open_bucket_add_buckets(struct bch_fs *c, ++static int open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, @@ -4454,6 +4553,7 @@ index 000000000..7a878a690 + unsigned flags, + struct closure *_cl) +{ ++ struct bch_fs *c = trans->c; + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; @@ -4485,8 +4585,9 @@ index 000000000..7a878a690 + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags, _cl); -+ if (ret == -FREELIST_EMPTY || -+ ret == -OPEN_BUCKETS_EMPTY) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty) || ++ bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; @@ -4504,10 +4605,13 @@ index 000000000..7a878a690 + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ -+ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, ++ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); -+ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { ++ if (ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && ++ !cl && _cl) { + cl = _cl; + goto retry_blocking; + } @@ -4625,15 +4729,25 @@ index 000000000..7a878a690 + return true; +} + -+static struct write_point *writepoint_find(struct bch_fs *c, ++static void bch2_trans_mutex_lock(struct btree_trans *trans, ++ struct mutex *lock) ++{ ++ if (!mutex_trylock(lock)) { ++ bch2_trans_unlock(trans); ++ mutex_lock(lock); ++ } ++} ++ ++static struct write_point *writepoint_find(struct btree_trans *trans, + unsigned long write_point) +{ ++ struct bch_fs *c = trans->c; + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; -+ mutex_lock(&wp->lock); ++ bch2_trans_mutex_lock(trans, &wp->lock); + return wp; + } + @@ -4642,7 +4756,7 @@ index 000000000..7a878a690 + wp = __writepoint_find(head, 
write_point); + if (wp) { +lock_wp: -+ mutex_lock(&wp->lock); ++ bch2_trans_mutex_lock(trans, &wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); @@ -4655,8 +4769,8 @@ index 000000000..7a878a690 + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + -+ mutex_lock(&oldest->lock); -+ mutex_lock(&c->write_points_hash_lock); ++ bch2_trans_mutex_lock(trans, &oldest->lock); ++ bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); @@ -4684,7 +4798,7 @@ index 000000000..7a878a690 +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, @@ -4695,6 +4809,7 @@ index 000000000..7a878a690 + unsigned flags, + struct closure *cl) +{ ++ struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + struct open_buckets ptrs; @@ -4714,7 +4829,7 @@ index 000000000..7a878a690 + write_points_nr = c->write_points_nr; + have_cache = false; + -+ wp = writepoint_find(c, write_point.v); ++ wp = writepoint_find(trans, write_point.v); + + if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -4724,21 +4839,21 @@ index 000000000..7a878a690 + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, cl); + } else { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, NULL); -+ if (!ret) ++ if (!ret || ret == -EINTR) + goto alloc_done; + -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + 0, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, @@ -4750,7 +4865,7 @@ index 000000000..7a878a690 + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + -+ if (ret == -INSUFFICIENT_DEVICES && ++ if (ret == -BCH_ERR_insufficient_devices && + nr_effective >= nr_replicas_required) + ret = 0; + @@ -4781,19 +4896,44 @@ index 000000000..7a878a690 + + mutex_unlock(&wp->lock); + -+ if (ret == -FREELIST_EMPTY && ++ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + -+ switch (ret) { -+ case -OPEN_BUCKETS_EMPTY: -+ case -FREELIST_EMPTY: ++ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); -+ case -INSUFFICIENT_DEVICES: ++ ++ if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) + return ERR_PTR(-EROFS); -+ default: -+ return ERR_PTR(ret); -+ } ++ ++ return ERR_PTR(ret); ++} ++ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, ++ erasure_code, ++ write_point, ++ devs_have, ++ nr_replicas, ++ nr_replicas_required, ++ reserve, ++ flags, cl))); ++ return wp; ++ +} + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) @@ -4921,10 +5061,10 @@ index 000000000..7a878a690 +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000..8bc78877f +index 000000000000..6de63a351fa8 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,173 @@ +@@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H @@ -5063,6 +5203,14 @@ index 000000000..8bc78877f + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); + ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); +struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, unsigned, + struct write_point_specifier, @@ -5100,7 +5248,7 @@ index 000000000..8bc78877f +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000..e078584d4 +index 000000000000..e078584d46f6 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,87 @@ @@ -5193,10 +5341,10 @@ index 000000000..e078584d4 +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 -index 000000000..f3260bbef +index 000000000000..5a46b25b0587 --- /dev/null +++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,891 @@ +@@ -0,0 +1,875 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -5642,8 +5790,8 @@ index 000000000..f3260bbef + goto out; + } + -+ for_each_btree_key(trans, bp_iter, BTREE_ID_backpointers, -+ bp_pos, 0, k, ret) { ++ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, ++ bp_pos, 0, k, ret) { + if (bpos_cmp(k.k->p, bp_end_pos) >= 0) + break; + @@ -5768,22 +5916,16 @@ index 000000000..f3260bbef + return NULL; +} + -+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter) ++static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, ++ struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bch_dev *ca; -+ struct bkey_s_c k, alloc_k; ++ struct bkey_s_c alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ k = bch2_btree_iter_peek(bp_iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ if (!k.k) -+ return 0; -+ + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "backpointer for mising device:\n%s", + 
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -5818,25 +5960,14 @@ index 000000000..f3260bbef +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ -+ struct btree_trans trans; + struct btree_iter iter; -+ int ret = 0; ++ struct bkey_s_c k; + -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_backpointers, POS_MIN, 0); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ bch2_check_btree_backpointer(&trans, &iter)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_backpointers, POS_MIN, 0, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ bch2_check_btree_backpointer(&trans, &iter, k))); +} + +static int check_bp_exists(struct btree_trans *trans, @@ -5899,6 +6030,7 @@ index 000000000..f3260bbef + bch2_bkey_val_to_text(&buf, c, alloc_k); + + if (c->sb.version < bcachefs_metadata_version_backpointers || ++ c->opts.reconstruct_alloc || + fsck_err(c, "%s", buf.buf)) { + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); + @@ -6003,7 +6135,7 @@ index 000000000..f3260bbef + BTREE_ITER_PREFETCH); + + do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(&trans, &iter)); @@ -6016,7 +6148,7 @@ index 000000000..f3260bbef + if (ret) + break; + -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(&trans, btree_id)); @@ -6074,7 +6206,7 @@ index 000000000..f3260bbef + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + -+ while (!(ret = __bch2_trans_do(&trans, NULL, NULL, ++ while (!(ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_one_backpointer(&trans, iter.pos, &bp_offset))) && @@ -6090,7 +6222,7 @@ index 000000000..f3260bbef +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 -index 000000000..fe42af296 +index 000000000000..fe42af296e9c --- /dev/null +++ b/fs/bcachefs/backpointers.h @@ -0,0 +1,38 @@ @@ -6134,10 +6266,10 @@ index 000000000..fe42af296 +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000..8b4d0eb5c +index 000000000000..8ffdb4dee47a --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,988 @@ +@@ -0,0 +1,1000 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -6459,6 +6591,8 @@ index 000000000..8b4d0eb5c +#undef BCH_DEBUG_PARAM +#endif + ++#define BCH_LOCK_TIME_NR 128 ++ +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ @@ -6603,6 +6737,7 @@ index 000000000..8b4d0eb5c + + /* Allocator: */ + u64 new_fs_bucket_idx; ++ u64 bucket_alloc_trans_early_cursor; + + unsigned nr_open_buckets; + unsigned nr_btree_reserve; @@ -6655,6 +6790,8 @@ index 000000000..8b4d0eb5c + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, + ++ BCH_FS_HAVE_DELETED_SNAPSHOTS, ++ + /* errors: */ + BCH_FS_ERROR, + BCH_FS_TOPOLOGY_ERROR, @@ -6666,6 +6803,11 @@ index 000000000..8b4d0eb5c + unsigned id; +}; + ++struct lock_held_stats { ++ 
struct time_stats times[BCH_LOCK_TIME_NR]; ++ const char *names[BCH_LOCK_TIME_NR]; ++}; ++ +struct bch_fs_pcpu { + u64 sectors_available; +}; @@ -7059,6 +7201,8 @@ index 000000000..8b4d0eb5c + bool promote_whole_extents; + + struct time_stats times[BCH_TIME_STAT_NR]; ++ ++ struct lock_held_stats lock_held_stats; +}; + +static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) @@ -7128,7 +7272,7 @@ index 000000000..8b4d0eb5c +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000..147fde141 +index 000000000000..147fde1417b0 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h @@ -0,0 +1,2052 @@ @@ -9186,7 +9330,7 @@ index 000000000..147fde141 +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 -index 000000000..b2edabf58 +index 000000000000..b2edabf58260 --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,368 @@ @@ -9560,7 +9704,7 @@ index 000000000..b2edabf58 +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000..cc0689635 +index 000000000000..cc0689635164 --- /dev/null +++ b/fs/bcachefs/bkey.c @@ -0,0 +1,1175 @@ @@ -10741,7 +10885,7 @@ index 000000000..cc0689635 +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000..7dee3d8e0 +index 000000000000..7dee3d8e0a3d --- /dev/null +++ b/fs/bcachefs/bkey.h @@ -0,0 +1,566 @@ @@ -11313,7 +11457,7 @@ index 000000000..7dee3d8e0 +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 -index 000000000..0d7c67a95 +index 000000000000..0d7c67a959af --- /dev/null +++ b/fs/bcachefs/bkey_buf.h @@ -0,0 +1,60 @@ @@ -11379,7 +11523,7 @@ index 000000000..0d7c67a95 +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000..e0cbac881 +index 000000000000..e0cbac8811af --- /dev/null +++ b/fs/bcachefs/bkey_methods.c @@ -0,0 +1,503 @@ @@ -11888,7 +12032,7 @@ index 000000000..e0cbac881 +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000..db894b40d +index 000000000000..db894b40d2ca --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,175 @@ @@ -12069,7 +12213,7 @@ index 000000000..db894b40d +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000..b1385a77d +index 000000000000..b1385a77da11 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,198 @@ @@ -12273,7 +12417,7 @@ index 000000000..b1385a77d +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 -index 000000000..79cf11d1b +index 000000000000..79cf11d1b4e7 --- /dev/null +++ b/fs/bcachefs/bkey_sort.h @@ -0,0 +1,44 @@ @@ -12323,7 +12467,7 @@ index 000000000..79cf11d1b +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000..fa60ef84e +index 000000000000..fa60ef84e4ef --- /dev/null +++ b/fs/bcachefs/bset.c @@ -0,0 +1,1598 @@ @@ -13927,7 +14071,7 @@ index 000000000..fa60ef84e +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000..0d46534c3 +index 000000000000..0d46534c3dcd --- /dev/null +++ b/fs/bcachefs/bset.h @@ -0,0 +1,615 @@ @@ -14548,10 +14692,10 @@ index 000000000..0d46534c3 +#endif /* 
_BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000..4d032ae3b +index 000000000000..579a8f8c6a65 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1162 @@ +@@ -0,0 +1,1170 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -14561,6 +14705,7 @@ index 000000000..4d032ae3b +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" ++#include "errcode.h" +#include "error.h" + +#include @@ -15256,8 +15401,7 @@ index 000000000..4d032ae3b + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_trans_restart_relock_parent_for_fill(trans->fn, + _THIS_IP_, btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); + } + + b = bch2_btree_node_mem_alloc(c, level != 0); @@ -15266,8 +15410,8 @@ index 000000000..4d032ae3b + trans->memory_allocation_failure = true; + trace_trans_restart_memory_allocation_failure(trans->fn, + _THIS_IP_, btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); + } + + if (IS_ERR(b)) @@ -15304,18 +15448,19 @@ index 000000000..4d032ae3b + if (!sync) + return NULL; + -+ if (trans && -+ (!bch2_trans_relock(trans) || -+ !bch2_btree_path_relock_intent(trans, path))) { -+ BUG_ON(!trans->restarted); -+ return ERR_PTR(-EINTR); ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { + trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, + btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + } + + return b; @@ -15326,7 +15471,9 @@ index 000000000..4d032ae3b + struct btree *b = container_of(lock, struct btree, c.lock); + const struct bkey_i *k = p; + -+ return b->hash_val == btree_ptr_hash_val(k) ? 
0 : -1; ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; +} + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) @@ -15385,6 +15532,7 @@ index 000000000..4d032ae3b + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; ++ int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + @@ -15447,13 +15595,16 @@ index 000000000..4d032ae3b + * was removed - and we'll bail out: + */ + if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(path, level + 1); ++ btree_node_unlock(trans, path, level + 1); + -+ if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, -+ lock_node_check_fn, (void *) k, trace_ip)) { -+ if (!trans->restarted) ++ ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, ++ lock_node_check_fn, (void *) k, trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) + goto retry; -+ return ERR_PTR(-EINTR); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ERR_PTR(ret); ++ BUG(); + } + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || @@ -15467,8 +15618,7 @@ index 000000000..4d032ae3b + trace_ip, + path->btree_id, + &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + } + @@ -15484,11 +15634,13 @@ index 000000000..4d032ae3b + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ -+ if (trans && -+ (!bch2_trans_relock(trans) || -+ !bch2_btree_path_relock_intent(trans, path))) { -+ BUG_ON(!trans->restarted); -+ return ERR_PTR(-EINTR); ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -15716,7 +15868,7 @@ index 000000000..4d032ae3b +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000..25906127c +index 000000000000..25906127c023 --- /dev/null +++ b/fs/bcachefs/btree_cache.h @@ -0,0 +1,107 @@ @@ -15829,10 +15981,10 @@ index 000000000..25906127c +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000..0e2c8745c +index 000000000000..2f563365ea4c --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2128 @@ +@@ -0,0 +1,2098 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -15933,7 +16085,7 @@ index 000000000..0e2c8745c + buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = -BCH_ERR_need_topology_repair; + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -15961,7 +16113,7 @@ index 000000000..0e2c8745c + buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = -BCH_ERR_need_topology_repair; + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -16237,8 +16389,8 @@ index 000000000..0e2c8745c + } + + if (ret) { -+ bch_err(c, "%s: error %i getting btree node", -+ __func__, ret); ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); + break; + } + @@ 
-16306,8 +16458,8 @@ index 000000000..0e2c8745c + ret = PTR_ERR_OR_ZERO(cur); + + if (ret) { -+ bch_err(c, "%s: error %i getting btree node", -+ __func__, ret); ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); + goto err; + } + @@ -16372,7 +16524,7 @@ index 000000000..0e2c8745c + + if (ret == DROP_THIS_NODE) { + bch_err(c, "empty btree root - repair unimplemented"); -+ ret = FSCK_ERR_EXIT; ++ ret = -BCH_ERR_fsck_repair_unimplemented; + } + } + @@ -16399,7 +16551,8 @@ index 000000000..0e2c8745c + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + -+ if (fsck_err_on(!g->gen_valid, c, ++ if (c->opts.reconstruct_alloc || ++ fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -16583,13 +16736,15 @@ index 000000000..0e2c8745c + if (level) + bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, *k); -+ bch_info(c, "updated %s", buf.buf); ++ if (c->opts.verbose) { ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); + -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); -+ bch_info(c, "new key %s", buf.buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ } + + *k = bkey_i_to_s_c(new); + } @@ -16631,12 +16786,12 @@ index 000000000..0e2c8745c + atomic64_set(&c->key_version, k->k->version.lo); + } + -+ ret = __bch2_trans_do(trans, NULL, NULL, 0, ++ ret = commit_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, old, *k, flags)); +fsck_err: +err: + if (ret) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -16742,7 +16897,8 @@ index 000000000..0e2c8745c + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); + if (ret) { -+ bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); ++ bch_err(c, "%s: error from bch2_gc_mark_key: %s", ++ __func__, bch2_err_str(ret)); + goto fsck_err; + } + @@ -16791,7 +16947,7 @@ index 000000000..0e2c8745c + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { -+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = -BCH_ERR_need_topology_repair; + bch_info(c, "Halting mark and sweep to start topology repair pass"); + goto fsck_err; + } else { @@ -16802,8 +16958,8 @@ index 000000000..0e2c8745c + continue; + } + } else if (ret) { -+ bch_err(c, "%s: error %i getting btree node", -+ __func__, ret); ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); + break; + } + @@ -16844,7 +17000,7 @@ index 000000000..0e2c8745c + if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); -+ ret = FSCK_ERR_EXIT; ++ ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + @@ -16853,7 +17009,7 @@ index 000000000..0e2c8745c + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, + "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); -+ ret = FSCK_ERR_EXIT; ++ ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + @@ -16870,7 +17026,7 @@ index 000000000..0e2c8745c + 
six_unlock_read(&b->c.lock); + + if (ret < 0) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + printbuf_exit(&buf); + return ret; +} @@ -16903,7 +17059,7 @@ index 000000000..0e2c8745c + : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + + if (ret < 0) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -17012,29 +17168,28 @@ index 000000000..0e2c8745c +{ + struct bch_dev *ca = NULL; + struct printbuf buf = PRINTBUF; -+ bool verify = !metadata_only && (!initial || -+ (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); ++ bool verify = !metadata_only && ++ !c->opts.reconstruct_alloc && ++ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + + percpu_down_write(&c->mark_lock); + +#define copy_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ -+ , ##__VA_ARGS__, dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ } ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f))) \ ++ dst->_f = src->_f +#define copy_stripe_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, "stripe %zu has wrong "_msg \ -+ ": got %u, should be %u", \ -+ iter.pos, ##__VA_ARGS__, \ -+ dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ } ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f))) \ ++ dst->_f = src->_f +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) 
\ @@ -17102,7 +17257,7 @@ index 000000000..0e2c8745c + if (ca) + percpu_ref_put(&ca->ref); + if (ret) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); @@ -17157,21 +17312,19 @@ index 000000000..0e2c8745c + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bkey_s_c k, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc, *b; -+ struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old, new; + enum bch_data_type type; + int ret; + -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; ++ if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ return 1; + + bch2_alloc_to_v4(k, &old); + new = old; @@ -17212,7 +17365,8 @@ index 000000000..0e2c8745c + return 0; + +#define copy_bucket_field(_f) \ -+ if (fsck_err_on(new._f != gc._f, c, \ ++ if (c->opts.reconstruct_alloc || \ ++ fsck_err_on(new._f != gc._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ @@ -17263,31 +17417,21 @@ index 000000000..0e2c8745c + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, ca->mi.first_bucket), -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) -+ break; ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bch2_alloc_write_key(&trans, &iter, -+ metadata_only)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err(c, "error writing alloc info: %i", ret); ++ if (ret < 0) { ++ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); + percpu_ref_put(&ca->ref); + break; + } + } + + bch2_trans_exit(&trans); -+ return ret; ++ return ret < 0 ? 
ret : 0; +} + +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) @@ -17344,7 +17488,7 @@ index 000000000..0e2c8745c + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error reading alloc info at gc start: %i", ret); ++ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); + + return ret; +} @@ -17371,14 +17515,64 @@ index 000000000..0e2c8745c + }; +} + ++static int bch2_gc_write_reflink_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ size_t *idx) ++{ ++ struct bch_fs *c = trans->c; ++ const __le64 *refcount = bkey_refcount_c(k); ++ struct printbuf buf = PRINTBUF; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (!refcount) ++ return 0; ++ ++ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && ++ r->offset < k.k->p.offset) ++ ++*idx; ++ ++ if (!r || ++ r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); ++ return -EINVAL; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ r->refcount)) { ++ struct bkey_i *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(new, k); ++ ++ if (!r->refcount) ++ new->k.type = KEY_TYPE_deleted; ++ else ++ *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ ++ ret = bch2_trans_update(trans, iter, new, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct reflink_gc *r; + size_t idx = 0; -+ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (metadata_only) @@ -17386,57 +17580,14 @@ index 000000000..0e2c8745c + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ const __le64 *refcount = bkey_refcount_c(k); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); + -+ if (!refcount) -+ continue; -+ -+ r = genradix_ptr(&c->reflink_gc_table, idx++); -+ if (!r || -+ r->offset != k.k->p.offset || -+ r->size != k.k->size) { -+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -+ ret = -EINVAL; -+ break; -+ } -+ -+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, -+ "reflink key has wrong refcount:\n" -+ " %s\n" -+ " should be %u", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf), -+ r->refcount)) { -+ struct bkey_i *new; -+ -+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); -+ if (!new) { -+ ret = -ENOMEM; -+ break; -+ } -+ -+ bkey_reassemble(new, k); -+ -+ if (!r->refcount) -+ new->k.type = KEY_TYPE_deleted; -+ else -+ *bkey_refcount(new) = cpu_to_le64(r->refcount); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); -+ kfree(new); -+ -+ if (ret) -+ break; -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(&trans, &iter); + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); -+ printbuf_exit(&buf); + return ret; +} + @@ -17488,15 +17639,59 @@ index 000000000..0e2c8745c + r->refcount = 0; +} + ++static int bch2_gc_write_stripes_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ 
struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ const struct bch_stripe *s; ++ struct gc_stripe *m; ++ unsigned i; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return 0; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) ++ goto inconsistent; ++ return 0; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); ++ ++ ret = bch2_trans_update(trans, iter, &new->k_i, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct gc_stripe *m; -+ const struct bch_stripe *s; -+ struct printbuf buf = PRINTBUF; -+ unsigned i; + int ret = 0; + + if (metadata_only) @@ -17504,50 +17699,13 @@ index 000000000..0e2c8745c + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->type != KEY_TYPE_stripe) -+ continue; -+ -+ s = bkey_s_c_to_stripe(k).v; -+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); -+ -+ for (i = 0; i < s->nr_blocks; i++) -+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) -+ goto inconsistent; -+ continue; -+inconsistent: -+ if (fsck_err_on(true, c, -+ "stripe has wrong block sector count %u:\n" -+ " %s\n" -+ " should be %u", i, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf), -+ m ? m->block_sectors[i] : 0)) { -+ struct bkey_i_stripe *new; -+ -+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); -+ if (!new) { -+ ret = -ENOMEM; -+ break; -+ } -+ -+ bkey_reassemble(&new->k_i, k); -+ -+ for (i = 0; i < new->v.nr_blocks; i++) -+ stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); -+ kfree(new); -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_stripes_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); -+ -+ printbuf_exit(&buf); + return ret; +} + @@ -17609,7 +17767,7 @@ index 000000000..0e2c8745c + + ret = bch2_gc_btrees(c, initial, metadata_only); + -+ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && ++ if (ret == -BCH_ERR_need_topology_repair && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); @@ -17617,8 +17775,8 @@ index 000000000..0e2c8745c + ret = 0; + } + -+ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR) -+ ret = FSCK_ERR_EXIT; ++ if (ret == -BCH_ERR_need_topology_repair) ++ ret = -BCH_ERR_fsck_errors_not_fixed; + + if (ret) + goto out; @@ -17680,10 +17838,15 @@ index 000000000..0e2c8745c + return ret; +} + -+static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) ++static int gc_btree_gens_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) +{ ++ struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; ++ struct bkey_i *u; ++ int ret; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { @@ -17691,7 +17854,7 @@ index 000000000..0e2c8745c + + if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); -+ return true; ++ goto update; + } + } + @@ -17703,77 +17866,27 @@ index 000000000..0e2c8745c + *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); ++ return 0; ++update: ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; + -+ return false; ++ bkey_reassemble(u, k); ++ ++ bch2_extent_normalize(c, bkey_i_to_s(u)); ++ return bch2_trans_update(trans, iter, u, 0); +} + -+/* -+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree -+ * node pointers currently never have cached pointers that can become stale: -+ */ -+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf sk; -+ int ret = 0, commit_err = 0; -+ -+ bch2_bkey_buf_init(&sk); -+ -+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ while ((bch2_trans_begin(trans), -+ k = bch2_btree_iter_peek(&iter)).k) { -+ ret = bkey_err(k); -+ -+ if (ret == -EINTR) -+ continue; -+ if (ret) -+ break; -+ -+ c->gc_gens_pos = iter.pos; -+ -+ if (gc_btree_gens_key(c, k) && !commit_err) { -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); -+ -+ commit_err = -+ bch2_trans_update(trans, &iter, sk.k, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_NOFAIL); -+ if (commit_err == -EINTR) { -+ commit_err = 0; -+ continue; -+ } -+ } -+ -+ bch2_btree_iter_advance(&iter); -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; -+} -+ -+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter 
*iter, ++ struct bkey_s_c k) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); -+ struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bkey_i_alloc_v4 *a_mut; + int ret; + -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + bch2_alloc_to_v4(k, &a); + + if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) @@ -17833,26 +17946,35 @@ index 000000000..0e2c8745c + + for (i = 0; i < BTREE_ID_NR; i++) + if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; -+ ret = bch2_gc_btree_gens(&trans, i); ++ ret = for_each_btree_key_commit(&trans, iter, i, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ gc_btree_gens_key(&trans, &iter, k)); + if (ret) { -+ bch_err(c, "error recalculating oldest_gen: %i", ret); ++ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + goto err; + } + } + -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ bch2_alloc_write_oldest_gen(&trans, &iter)); -+ if (ret) { -+ bch_err(c, "error writing oldest_gen: %i", ret); -+ break; -+ } ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, ++ BTREE_ITER_PREFETCH, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; @@ -17922,7 +18044,7 @@ index 000000000..0e2c8745c + ret = bch2_gc_gens(c); +#endif + if (ret < 0) -+ bch_err(c, "btree gc failed: %i", ret); ++ bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); + + debug_check_no_locks_held(); + } @@ -17952,7 +18074,7 @@ index 000000000..0e2c8745c + + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) { -+ bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); ++ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); + return PTR_ERR(p); + } + @@ -17963,7 +18085,7 @@ index 000000000..0e2c8745c +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 -index 000000000..95d803b57 +index 000000000000..95d803b5743d --- /dev/null +++ b/fs/bcachefs/btree_gc.h @@ -0,0 +1,112 @@ @@ -18081,7 +18203,7 @@ index 000000000..95d803b57 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000..9bf3f77bc +index 000000000000..ae731b3a3908 --- /dev/null +++ b/fs/bcachefs/btree_io.c @@ -0,0 +1,2150 @@ @@ -18630,7 +18752,7 @@ index 000000000..9bf3f77bc + struct printbuf out = PRINTBUF; \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ -+ prt_printf(&out, ": " msg, ##__VA_ARGS__); \ ++ prt_printf(&out, ": " msg, ##__VA_ARGS__); \ + \ + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ @@ -18645,7 +18767,7 @@ index 000000000..9bf3f77bc + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + case BTREE_ERR_WANT_RETRY: \ + if (have_retry) { \ @@ -18657,7 +18779,7 @@ index 000000000..9bf3f77bc + ret = BTREE_RETRY_READ; \ + goto fsck_err; \ + case BTREE_ERR_FATAL: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto 
fsck_err; \ + } \ + break; \ @@ -18665,7 +18787,7 @@ index 000000000..9bf3f77bc + bch_err(c, "corrupt metadata before write: %s", out.buf);\ + \ + if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + } \ + break; \ @@ -20237,7 +20359,7 @@ index 000000000..9bf3f77bc +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000..8af853642 +index 000000000000..8af853642123 --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,222 @@ @@ -20465,10 +20587,10 @@ index 000000000..8af853642 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000..a1512eb06 +index 000000000000..a90a45939aa3 --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3471 @@ +@@ -0,0 +1,3515 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20487,6 +20609,7 @@ index 000000000..a1512eb06 +#include "replicas.h" +#include "subvolume.h" + ++#include +#include +#include + @@ -20517,7 +20640,7 @@ index 000000000..a1512eb06 + if (need_resched() || race_fault()) { + bch2_trans_unlock(trans); + schedule(); -+ return bch2_trans_relock(trans) ? 0 : -EINTR; ++ return bch2_trans_relock(trans); + } else { + return 0; + } @@ -20664,12 +20787,14 @@ index 000000000..a1512eb06 + return true; + } +fail: -+ trace_btree_node_relock_fail(trans->fn, _RET_IP_, -+ path->btree_id, -+ &path->pos, -+ (unsigned long) b, -+ path->l[level].lock_seq, -+ is_btree_node(path, level) ? b->c.lock.state.seq : 0); ++ if (b != BTREE_ITER_NO_NODE_CACHED && ++ b != BTREE_ITER_NO_NODE_INIT) ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); + return false; +} + @@ -20705,7 +20830,7 @@ index 000000000..a1512eb06 + + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { -+ btree_node_unlock(path, level); ++ btree_node_unlock(trans, path, level); + goto success; + } + @@ -20740,7 +20865,7 @@ index 000000000..a1512eb06 + * the node that we failed to relock: + */ + if (fail_idx >= 0) { -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { @@ -20766,13 +20891,13 @@ index 000000000..a1512eb06 +} + +/* Slowpath: */ -+bool __bch2_btree_node_lock(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct bpos pos, unsigned level, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) ++int __bch2_btree_node_lock(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) +{ + struct btree_path *linked; + unsigned reason; @@ -20834,7 +20959,6 @@ index 000000000..a1512eb06 + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { -+ BUG_ON(trans->in_traverse_all); + reason = 7; + goto deadlock; + } @@ -20851,8 +20975,7 @@ index 000000000..a1512eb06 + path->btree_id, + path->cached, + &pos); -+ btree_trans_restart(trans); -+ return false; ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); +} + +/* Btree iterator locking: */ @@ -20890,8 +21013,8 @@ index 000000000..a1512eb06 +/* + * Only for btree_cache.c - only relocks intent locks + */ -+bool bch2_btree_path_relock_intent(struct btree_trans *trans, -+ struct btree_path *path) ++int bch2_btree_path_relock_intent(struct btree_trans *trans, ++ struct btree_path *path) +{ + unsigned l; + @@ -20899,20 +21022,19 @@ index 000000000..a1512eb06 + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, + path->btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return false; ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + -+ return true; ++ return 0; +} + +__flatten -+static bool bch2_btree_path_relock(struct btree_trans *trans, ++static int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + bool ret = btree_path_get_locks(trans, path, false); @@ -20920,9 +21042,10 @@ index 000000000..a1512eb06 + if (!ret) { + trace_trans_restart_relock_path(trans->fn, trace_ip, + path->btree_id, &path->pos); -+ btree_trans_restart(trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } -+ return ret; ++ ++ return 0; +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, @@ -20970,7 +21093,8 @@ index 000000000..a1512eb06 + return false; +} + -+void __bch2_btree_path_downgrade(struct btree_path *path, ++void __bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; @@ -20982,7 +21106,7 @@ index 000000000..a1512eb06 + while (path->nodes_locked && + (l 
= __fls(path->nodes_locked)) >= path->locks_want) { + if (l > path->level) { -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); @@ -21000,27 +21124,27 @@ index 000000000..a1512eb06 + struct btree_path *path; + + trans_for_each_path(trans, path) -+ bch2_btree_path_downgrade(path); ++ bch2_btree_path_downgrade(trans, path); +} + +/* Btree transaction locking: */ + -+bool bch2_trans_relock(struct btree_trans *trans) ++int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) -+ return false; ++ return -BCH_ERR_transaction_restart_relock; + + trans_for_each_path(trans, path) + if (path->should_be_locked && -+ !bch2_btree_path_relock(trans, path, _RET_IP_)) { ++ bch2_btree_path_relock(trans, path, _RET_IP_)) { + trace_trans_restart_relock(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + BUG_ON(!trans->restarted); -+ return false; ++ return -BCH_ERR_transaction_restart_relock; + } -+ return true; ++ return 0; +} + +void bch2_trans_unlock(struct btree_trans *trans) @@ -21028,7 +21152,7 @@ index 000000000..a1512eb06 + struct btree_path *path; + + trans_for_each_path(trans, path) -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + + /* + * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking @@ -21056,7 +21180,7 @@ index 000000000..a1512eb06 + bkey_cmp(ck->key.pos, path->pos)); + + if (!locked) -+ btree_node_unlock(path, 0); ++ btree_node_unlock(trans, path, 0); +} + +static void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -21113,7 +21237,7 @@ index 000000000..a1512eb06 + } + + if (!locked) -+ btree_node_unlock(path, level); ++ btree_node_unlock(trans, path, level); + return; +err: + bch2_bpos_to_text(&buf1, path->pos); @@ -21490,27 +21614,29 @@ index 000000000..a1512eb06 + bch2_btree_node_iter_peek_all(&l->iter, l->b)); +} + -+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, ++static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ -+ struct bkey_s_c k = __btree_iter_unpack(c, l, u, ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + path->pos = k.k ? k.k->p : l->b->key.k.p; ++ bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + -+static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, ++static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ -+ struct bkey_s_c k = __btree_iter_unpack(c, l, u, ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_prev(&l->iter, l->b)); + + path->pos = k.k ? 
k.k->p : l->b->data->min_key; ++ bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + @@ -21585,7 +21711,7 @@ index 000000000..a1512eb06 + } + + if (!parent_locked) -+ btree_node_unlock(path, plevel); ++ btree_node_unlock(trans, path, plevel); +} + +static inline void __btree_path_level_init(struct btree_path *path, @@ -21637,7 +21763,7 @@ index 000000000..a1512eb06 + + if (path->nodes_locked && + t != BTREE_NODE_UNLOCKED) { -+ btree_node_unlock(path, b->c.level); ++ btree_node_unlock(trans, path, b->c.level); + six_lock_increment(&b->c.lock, t); + mark_btree_node_locked(trans, path, b->c.level, t); + } @@ -21665,7 +21791,9 @@ index 000000000..a1512eb06 + struct btree *b = container_of(lock, struct btree, c.lock); + struct btree **rootp = p; + -+ return b == *rootp ? 0 : -1; ++ if (b != *rootp) ++ return BCH_ERR_lock_fail_root_changed; ++ return 0; +} + +static inline int btree_path_lock_root(struct btree_trans *trans, @@ -21677,6 +21805,7 @@ index 000000000..a1512eb06 + struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; + enum six_lock_type lock_type; + unsigned i; ++ int ret; + + EBUG_ON(path->nodes_locked); + @@ -21698,13 +21827,16 @@ index 000000000..a1512eb06 + } + + lock_type = __btree_lock_want(path, path->level); -+ if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, -+ path->level, lock_type, -+ lock_root_check_fn, rootp, -+ trace_ip))) { -+ if (trans->restarted) -+ return -EINTR; -+ continue; ++ ret = btree_node_lock(trans, path, b, SPOS_MAX, ++ path->level, lock_type, ++ lock_root_check_fn, rootp, ++ trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) ++ continue; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ BUG(); + } + + if (likely(b == READ_ONCE(*rootp) && @@ -21756,7 +21888,7 @@ index 000000000..a1512eb06 + } + + if (!was_locked) -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; @@ -21791,7 +21923,7 @@ index 000000000..a1512eb06 + } + + if (!was_locked) -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; @@ -21816,7 +21948,7 @@ index 000000000..a1512eb06 + bp->mem_ptr = (unsigned long)b; + + if (!locked) -+ btree_node_unlock(path, plevel); ++ btree_node_unlock(trans, path, plevel); +} + +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, @@ -21889,7 +22021,7 @@ index 000000000..a1512eb06 + btree_node_mem_ptr_set(trans, path, level + 1, b); + + if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(path, level + 1); ++ btree_node_unlock(trans, path, level + 1); + path->level = level; + + bch2_btree_path_verify_locks(path); @@ -21909,11 +22041,11 @@ index 000000000..a1512eb06 + int i, ret = 0; + + if (trans->in_traverse_all) -+ return -EINTR; ++ return -BCH_ERR_transaction_restart_in_traverse_all; + + trans->in_traverse_all = true; +retry_all: -+ trans->restarted = false; ++ trans->restarted = 0; + trans->traverse_all_idx = U8_MAX; + + trans_for_each_path(trans, path) @@ -21957,7 +22089,8 @@ index 000000000..a1512eb06 + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); -+ if (ret == -EINTR || ret == -ENOMEM) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == -ENOMEM) + goto retry_all; + if (ret) + goto err; @@ -21998,9 +22131,10 @@ index 000000000..a1512eb06 + return true; +} + -+static void 
btree_path_set_level_up(struct btree_path *path) ++static void btree_path_set_level_up(struct btree_trans *trans, ++ struct btree_path *path) +{ -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); @@ -22016,7 +22150,7 @@ index 000000000..a1512eb06 + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); @@ -22030,7 +22164,7 @@ index 000000000..a1512eb06 + + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } @@ -22041,7 +22175,7 @@ index 000000000..a1512eb06 + i++) + if (!bch2_btree_node_relock(trans, path, i)) + while (l <= i) { -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } @@ -22064,19 +22198,17 @@ index 000000000..a1512eb06 + unsigned long trace_ip) +{ + unsigned depth_want = path->level; -+ int ret = 0; ++ int ret = trans->restarted; + -+ if (unlikely(trans->restarted)) { -+ ret = -EINTR; ++ if (unlikely(ret)) + goto out; -+ } + + /* + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: + */ + if (path->should_be_locked) { -+ ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR; ++ ret = bch2_btree_path_relock(trans, path, trace_ip); + goto out; + } + @@ -22110,7 +22242,7 @@ index 000000000..a1512eb06 + goto out; + } + -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + path->level = depth_want; + + if (ret == -EIO) @@ -22125,7 +22257,7 @@ index 000000000..a1512eb06 + + path->uptodate = BTREE_ITER_UPTODATE; +out: -+ BUG_ON((ret == -EINTR) != !!trans->restarted); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + bch2_btree_path_verify(trans, path); + return ret; +} @@ -22133,6 +22265,16 @@ index 000000000..a1512eb06 +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ ++ if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); ++ u64 mask = ~(~0ULL << restart_probability_bits); ++ ++ if ((prandom_u32() & mask) == mask) { ++ trace_transaction_restart_injected(trans->fn, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ } ++ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + @@ -22207,7 +22349,7 @@ index 000000000..a1512eb06 + bch2_btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { -+ btree_node_unlock(path, 0); ++ btree_node_unlock(trans, path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; @@ -22230,7 +22372,7 @@ index 000000000..a1512eb06 + + if (l != path->level) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + } +out: + bch2_btree_path_verify(trans, path); @@ -22271,7 +22413,7 @@ index 000000000..a1512eb06 + +static inline void __bch2_path_free(struct btree_trans *trans, 
struct btree_path *path) +{ -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); +} @@ -22609,26 +22751,25 @@ index 000000000..a1512eb06 + + /* got to end? */ + if (!btree_path_node(path, path->level + 1)) { -+ btree_path_set_level_up(path); ++ btree_path_set_level_up(trans, path); + return NULL; + } + + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); -+ btree_trans_restart(trans); -+ ret = -EINTR; ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + goto err; + } + + b = btree_path_node(path, path->level + 1); + + if (!bpos_cmp(iter->pos, b->key.k.p)) { -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + } else { @@ -22997,8 +23138,8 @@ index 000000000..a1512eb06 +out: + if (iter->update_path) { + if (iter->update_path->uptodate && -+ !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { -+ k = bkey_s_c_err(-EINTR); ++ (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) { ++ k = bkey_s_c_err(ret); + } else { + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; @@ -23065,7 +23206,7 @@ index 000000000..a1512eb06 + (iter->advanced && + !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; -+ btree_path_set_level_up(iter->path); ++ btree_path_set_level_up(trans, iter->path); + iter->advanced = false; + continue; + } @@ -23165,13 +23306,13 @@ index 000000000..a1512eb06 + goto out; + } + -+ k = btree_path_level_peek(trans->c, iter->path, ++ k = btree_path_level_peek(trans, iter->path, + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 + : bpos_cmp(k.k->p, search_key) > 0)) -+ k = btree_path_level_prev(trans->c, iter->path, ++ k = btree_path_level_prev(trans, iter->path, + &iter->path->l[0], &iter->k); + + bch2_btree_path_check_sort(trans, iter->path, 0); @@ -23655,8 +23796,7 @@ index 000000000..a1512eb06 + + if (old_bytes) { + trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + } + } + @@ -23670,11 +23810,11 @@ index 000000000..a1512eb06 + * bch2_trans_begin() - reset a transaction after a interrupted attempt + * @trans: transaction to reset + * -+ * While iterating over nodes or updating nodes a attempt to lock a btree -+ * node may return EINTR when the trylock fails. When this occurs -+ * bch2_trans_begin() should be called and the transaction retried. ++ * While iterating over nodes or updating nodes a attempt to lock a btree node ++ * may return BCH_ERR_transaction_restart when the trylock fails. When this ++ * occurs bch2_trans_begin() should be called and the transaction retried. 
+ */ -+void bch2_trans_begin(struct btree_trans *trans) ++u32 bch2_trans_begin(struct btree_trans *trans) +{ + struct btree_path *path; + @@ -23712,12 +23852,28 @@ index 000000000..a1512eb06 + path->preserve = false; + } + -+ bch2_trans_cond_resched(trans); ++ if (!trans->restarted && ++ (need_resched() || ++ ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { ++ bch2_trans_unlock(trans); ++ cond_resched(); ++ bch2_trans_relock(trans); ++ } + ++ trans->last_restarted_ip = _RET_IP_; + if (trans->restarted) + bch2_btree_path_traverse_all(trans); + -+ trans->restarted = false; ++ trans->last_begin_time = ktime_get_ns(); ++ return trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) ++{ ++ bch2_trans_inconsistent_on(trans_was_restarted(trans, restart_count), trans, ++ "trans->restart_count %u, should be %u, last restarted by %ps\n", ++ trans->restart_count, restart_count, ++ (void *) trans->last_restarted_ip); +} + +static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) @@ -23751,8 +23907,18 @@ index 000000000..a1512eb06 + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->fn = fn; ++ trans->last_begin_time = ktime_get_ns(); + trans->task = current; + ++ while (c->lock_held_stats.names[trans->lock_name_idx] != fn ++ && c->lock_held_stats.names[trans->lock_name_idx] != 0) ++ trans->lock_name_idx++; ++ ++ if (trans->lock_name_idx >= BCH_LOCK_TIME_NR) ++ pr_warn_once("lock_times array not big enough!"); ++ else ++ c->lock_held_stats.names[trans->lock_name_idx] = fn; ++ + bch2_trans_alloc_paths(trans, c); + + if (expected_mem_bytes) { @@ -23942,10 +24108,10 @@ index 000000000..a1512eb06 +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000..9da0a4152 +index 000000000000..1b02f75d4cab --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,411 @@ +@@ -0,0 +1,556 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -23953,6 +24119,8 @@ index 000000000..9da0a4152 +#include "bset.h" +#include "btree_types.h" + ++#include ++ +static inline void __btree_path_get(struct btree_path *path, bool intent) +{ + path->ref++; @@ -24107,19 +24275,36 @@ index 000000000..9da0a4152 + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + -+bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); ++int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); + -+bool bch2_trans_relock(struct btree_trans *); ++int bch2_trans_relock(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); + -+__always_inline -+static inline int btree_trans_restart(struct btree_trans *trans) ++static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) +{ -+ trans->restarted = true; -+ bch2_trans_unlock(trans); -+ return -EINTR; ++ return restart_count != trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *, u32); ++ ++__always_inline ++static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) ++{ ++ BUG_ON(err <= 0); ++ BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); ++ ++ trans->restarted = err; ++ trans->restart_count++; ++ return -err; ++} ++ ++__always_inline ++static inline int btree_trans_restart(struct btree_trans *trans, int err) ++{ ++ 
btree_trans_restart_nounlock(trans, err); ++ return -err; +} + +bool bch2_btree_node_upgrade(struct btree_trans *, @@ -24139,14 +24324,15 @@ index 000000000..9da0a4152 + : path->uptodate == BTREE_ITER_UPTODATE; +} + -+void __bch2_btree_path_downgrade(struct btree_path *, unsigned); ++void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); + -+static inline void bch2_btree_path_downgrade(struct btree_path *path) ++static inline void bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path) +{ + unsigned new_locks_want = path->level + !!path->intent_ref; + + if (path->locks_want > new_locks_want) -+ __bch2_btree_path_downgrade(path, new_locks_want); ++ __bch2_btree_path_downgrade(trans, path, new_locks_want); +} + +void bch2_trans_downgrade(struct btree_trans *); @@ -24231,7 +24417,7 @@ index 000000000..9da0a4152 +} + +void *bch2_trans_kmalloc(struct btree_trans *, size_t); -+void bch2_trans_begin(struct btree_trans *); ++u32 bch2_trans_begin(struct btree_trans *); + +static inline struct btree * +__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) @@ -24239,7 +24425,7 @@ index 000000000..9da0a4152 + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), -+ PTR_ERR_OR_ZERO(b) == -EINTR) ++ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) + bch2_trans_begin(trans); + + return b; @@ -24263,6 +24449,15 @@ index 000000000..9da0a4152 + return PTR_ERR_OR_ZERO(k.k); +} + ++static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, ++ unsigned flags) ++{ ++ BUG_ON(flags & BTREE_ITER_ALL_LEVELS); ++ ++ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek_prev(iter); ++} ++ +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + unsigned flags) +{ @@ -24286,8 +24481,12 @@ index 000000000..9da0a4152 + +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ -+ return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 -+ ? 
-EINTR : 0; ++ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX) { ++ trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); ++ } ++ ++ return 0; +} + +static inline struct bkey_s_c @@ -24298,12 +24497,124 @@ index 000000000..9da0a4152 + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), -+ bkey_err(k) == -EINTR)) ++ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + ++#define lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count; \ ++ int _ret; \ ++ \ ++ do { \ ++ _restart_count = bch2_trans_begin(_trans); \ ++ _ret = (_do); \ ++ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ ++ \ ++ if (!_ret) \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ _ret; \ ++}) ++ ++/* ++ * nested_lockrestart_do(), nested_commit_do(): ++ * ++ * These are like lockrestart_do() and commit_do(), with two differences: ++ * ++ * - We don't call bch2_trans_begin() unless we had a transaction restart ++ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a ++ * transaction restart ++ */ ++#define nested_lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count, _orig_restart_count; \ ++ int _ret; \ ++ \ ++ _restart_count = _orig_restart_count = (_trans)->restart_count; \ ++ \ ++ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ ++ _restart_count = bch2_trans_begin(_trans); \ ++ \ ++ if (!_ret) \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ ++ _ret = -BCH_ERR_transaction_restart_nested; \ ++ \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key2(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_advance(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_rewind(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ ++ _start, _iter_flags, _k, \ ++ _disk_res, _journal_seq, _commit_flags,\ ++ _do) \ ++ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ ++ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_commit_flags))) ++ 
+#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -24359,10 +24670,10 @@ index 000000000..9da0a4152 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000..a5b0a956e +index 000000000000..661006e427f2 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,850 @@ +@@ -0,0 +1,855 @@ + +#include "bcachefs.h" +#include "btree_cache.h" @@ -24370,6 +24681,7 @@ index 000000000..a5b0a956e +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" ++#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" @@ -24657,7 +24969,7 @@ index 000000000..a5b0a956e + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_trans_restart_relock_key_cache_fill(trans->fn, + _THIS_IP_, ck_path->btree_id, &ck_path->pos); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + goto err; + } + @@ -24712,8 +25024,10 @@ index 000000000..a5b0a956e + struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); + const struct btree_path *path = p; + -+ return ck->key.btree_id == path->btree_id && -+ !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1; ++ if (ck->key.btree_id != path->btree_id && ++ bpos_cmp(ck->key.pos, path->pos)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; +} + +__flatten @@ -24752,14 +25066,15 @@ index 000000000..a5b0a956e + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + -+ if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, -+ lock_want, -+ bkey_cached_check_fn, path, _THIS_IP_)) { -+ if (!trans->restarted) ++ ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, ++ lock_want, ++ bkey_cached_check_fn, path, _THIS_IP_); ++ if (ret) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) + goto retry; -+ -+ ret = -EINTR; -+ goto err; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto err; ++ BUG(); + } + + if (ck->key.btree_id != path->btree_id || @@ -24778,7 +25093,7 @@ index 000000000..a5b0a956e + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { + trace_transaction_restart_ip(trans->fn, _THIS_IP_); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + goto err; + } + @@ -24795,8 +25110,8 @@ index 000000000..a5b0a956e + + return ret; +err: -+ if (ret != -EINTR) { -+ btree_node_unlock(path, 0); ++ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ btree_node_unlock(trans, path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_ERROR; + } + return ret; @@ -24862,13 +25177,14 @@ index 000000000..a5b0a956e + ? 
JOURNAL_WATERMARK_reserved + : 0)| + commit_flags); -+ if (ret) { -+ bch2_fs_fatal_err_on(ret != -EINTR && -+ ret != -EAGAIN && -+ !bch2_journal_error(j), c, -+ "error flushing key cache: %i", ret); ++ ++ bch2_fs_fatal_err_on(ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && ++ !bch2_journal_error(j), c, ++ "error flushing key cache: %s", bch2_err_str(ret)); ++ if (ret) + goto out; -+ } + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); @@ -25215,7 +25531,7 @@ index 000000000..a5b0a956e +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 -index 000000000..670746e72 +index 000000000000..670746e72dab --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h @@ -0,0 +1,47 @@ @@ -25268,10 +25584,10 @@ index 000000000..670746e72 +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000..67c970d72 +index 000000000000..49eef650e436 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,259 @@ +@@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -25332,7 +25648,7 @@ index 000000000..67c970d72 + path->nodes_intent_locked &= ~(1 << level); +} + -+static inline void mark_btree_node_locked(struct btree_trans *trans, ++static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + enum six_lock_type type) @@ -25347,11 +25663,22 @@ index 000000000..67c970d72 + path->nodes_intent_locked |= type << level; +} + ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ mark_btree_node_locked_noreset(trans, path, level, type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[level].lock_taken_time = ktime_get_ns(); ++#endif ++} ++ +static inline void mark_btree_node_intent_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level) +{ -+ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); +} + +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -25373,23 +25700,35 @@ index 000000000..67c970d72 + return BTREE_NODE_UNLOCKED; +} + -+static inline void btree_node_unlock(struct btree_path *path, unsigned level) ++static inline void btree_node_unlock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) +{ + int lock_type = btree_node_locked_type(path, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); + -+ if (lock_type != BTREE_NODE_UNLOCKED) ++ if (lock_type != BTREE_NODE_UNLOCKED) { + six_unlock_type(&path->l[level].b->c.lock, lock_type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ if (trans->lock_name_idx < BCH_LOCK_TIME_NR) { ++ struct bch_fs *c = trans->c; ++ ++ __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx], ++ path->l[level].lock_taken_time, ++ ktime_get_ns()); ++ } ++#endif ++ } + mark_btree_node_unlocked(path, level); +} + -+static inline void __bch2_btree_path_unlock(struct btree_path *path) ++static inline void __bch2_btree_path_unlock(struct btree_trans *trans, ++ struct btree_path *path) +{ + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + + while (path->nodes_locked) -+ btree_node_unlock(path, __ffs(path->nodes_locked)); ++ 
btree_node_unlock(trans, path, __ffs(path->nodes_locked)); +} + +static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) @@ -25406,7 +25745,7 @@ index 000000000..67c970d72 + } +} + -+static inline bool btree_node_lock_type(struct btree_trans *trans, ++static inline int btree_node_lock_type(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, @@ -25415,10 +25754,10 @@ index 000000000..67c970d72 +{ + struct bch_fs *c = trans->c; + u64 start_time; -+ bool ret; ++ int ret; + + if (six_trylock_type(&b->c.lock, type)) -+ return true; ++ return 0; + + start_time = local_clock(); + @@ -25428,13 +25767,14 @@ index 000000000..67c970d72 + trans->locking_level = level; + trans->locking_lock_type = type; + trans->locking = b; -+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); + trans->locking = NULL; + + if (ret) -+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ return ret; + -+ return ret; ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ return 0; +} + +/* @@ -25457,26 +25797,34 @@ index 000000000..67c970d72 + return false; +} + -+bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, -+ struct btree *, struct bpos, unsigned, -+ enum six_lock_type, -+ six_lock_should_sleep_fn, void *, -+ unsigned long); ++int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, ++ struct btree *, struct bpos, unsigned, ++ enum six_lock_type, ++ six_lock_should_sleep_fn, void *, ++ unsigned long); + -+static inline bool btree_node_lock(struct btree_trans *trans, ++static inline int btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ ++ int ret = 0; ++ + EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + -+ return likely(six_trylock_type(&b->c.lock, type)) || -+ btree_node_lock_increment(trans, b, level, type) || -+ __bch2_btree_node_lock(trans, path, b, pos, level, type, -+ should_sleep_fn, p, ip); ++ if (likely(six_trylock_type(&b->c.lock, type)) || ++ btree_node_lock_increment(trans, b, level, type) || ++ !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, ++ should_sleep_fn, p, ip))) { ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[b->c.level].lock_taken_time = ktime_get_ns(); ++#endif ++ } ++ ++ return ret; +} + +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); @@ -25529,14 +25877,12 @@ index 000000000..67c970d72 +} + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ -+ -+ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000..1e4d1fecc +index 000000000000..a2826dfe13cb --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,687 @@ +@@ -0,0 +1,697 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -25790,6 +26136,9 @@ index 000000000..1e4d1fecc + struct btree *b; + struct btree_node_iter iter; + u32 lock_seq; ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ u64 lock_taken_time; ++#endif + } l[BTREE_MAX_DEPTH]; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; @@ -25923,10 +26272,13 @@ index 000000000..1e4d1fecc + +#define BTREE_TRANS_MEM_MAX (1U << 16) + ++#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 ++ 
+struct btree_trans { + struct bch_fs *c; + const char *fn; + struct list_head list; ++ u64 last_begin_time; + struct btree *locking; + unsigned locking_path_idx; + struct bpos locking_pos; @@ -25941,9 +26293,12 @@ index 000000000..1e4d1fecc + u8 traverse_all_idx; + bool used_mempool:1; + bool in_traverse_all:1; -+ bool restarted:1; + bool memory_allocation_failure:1; + bool is_initial_gc:1; ++ enum bch_errcode restarted:16; ++ u32 restart_count; ++ unsigned long last_restarted_ip; ++ + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: @@ -25973,6 +26328,7 @@ index 000000000..1e4d1fecc + unsigned journal_u64s; + unsigned journal_preres_u64s; + struct replicas_delta_list *fs_usage_deltas; ++ int lock_name_idx; +}; + +#define BTREE_FLAGS() \ @@ -26226,10 +26582,10 @@ index 000000000..1e4d1fecc +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000..28f958577 +index 000000000000..89941fb8caa0 --- /dev/null +++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,156 @@ +@@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H @@ -26322,7 +26678,6 @@ index 000000000..28f958577 + * This is main entry point for btree updates. + * + * Return values: -+ * -EINTR: locking changed, this function should be called again. + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ @@ -26338,30 +26693,33 @@ index 000000000..28f958577 + return __bch2_trans_commit(trans); +} + -+#define lockrestart_do(_trans, _do) \ -+({ \ -+ int _ret; \ -+ \ -+ do { \ -+ bch2_trans_begin(_trans); \ -+ _ret = (_do); \ -+ } while (_ret == -EINTR); \ -+ \ -+ _ret; \ -+}) -+ -+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + ++#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ -+ _do); \ ++ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ ++ bch2_trans_exit(&trans); \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_run(_c, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = (_do); \ + bch2_trans_exit(&trans); \ + \ + _ret; \ @@ -26388,10 +26746,10 @@ index 000000000..28f958577 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000..965fdfbfa +index 000000000000..5525635ec04a --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2253 @@ +@@ -0,0 +1,2266 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -26572,12 +26930,13 @@ index 000000000..965fdfbfa + six_unlock_intent(&b->c.lock); +} + -+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, ++static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, + bool interior_node, + unsigned flags) +{ ++ struct bch_fs *c = trans->c; + struct 
write_point *wp; + struct btree *b; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; @@ -26607,7 +26966,7 @@ index 000000000..965fdfbfa + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: -+ wp = bch2_alloc_sectors_start(c, ++ wp = bch2_alloc_sectors_start_trans(trans, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, @@ -26806,18 +27165,16 @@ index 000000000..965fdfbfa + } +} + -+static int bch2_btree_reserve_get(struct btree_update *as, ++static int bch2_btree_reserve_get(struct btree_trans *trans, ++ struct btree_update *as, + unsigned nr_nodes[2], -+ unsigned flags) ++ unsigned flags, ++ struct closure *cl) +{ + struct bch_fs *c = as->c; -+ struct closure cl; + struct btree *b; + unsigned interior; -+ int ret; -+ -+ closure_init_stack(&cl); -+retry: ++ int ret = 0; + + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + @@ -26828,18 +27185,17 @@ index 000000000..965fdfbfa + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: + */ -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); + if (ret) -+ goto err; ++ return ret; + + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? NULL : &cl, -+ interior, flags); ++ b = __bch2_btree_node_alloc(trans, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ? NULL : cl, ++ interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; @@ -26848,18 +27204,8 @@ index 000000000..965fdfbfa + p->b[p->nr++] = b; + } + } -+ -+ bch2_btree_cache_cannibalize_unlock(c); -+ closure_sync(&cl); -+ return 0; +err: + bch2_btree_cache_cannibalize_unlock(c); -+ closure_sync(&cl); -+ -+ if (ret == -EAGAIN) -+ goto retry; -+ -+ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl); + return ret; +} + @@ -27004,7 +27350,7 @@ index 000000000..965fdfbfa + * which may require allocations as well. 
+ */ + bch2_trans_init(&trans, c, 0, 512); -+ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, ++ ret = commit_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| @@ -27374,6 +27720,7 @@ index 000000000..965fdfbfa + unsigned update_level = level; + int journal_flags = flags & JOURNAL_WATERMARK_MASK; + int ret = 0; ++ u32 restart_count = trans->restart_count; + + BUG_ON(!path->should_be_locked); + @@ -27401,7 +27748,7 @@ index 000000000..965fdfbfa + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + return ERR_PTR(ret); + } + @@ -27410,9 +27757,10 @@ index 000000000..965fdfbfa + else if (!down_read_trylock(&c->gc_lock)) { + bch2_trans_unlock(trans); + down_read(&c->gc_lock); -+ if (!bch2_trans_relock(trans)) { ++ ret = bch2_trans_relock(trans); ++ if (ret) { + up_read(&c->gc_lock); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(ret); + } + } + @@ -27447,16 +27795,24 @@ index 000000000..965fdfbfa + if (ret) + goto err; + -+ bch2_trans_unlock(trans); -+ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); + if (ret) { -+ bch2_btree_update_free(as); -+ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); -+ btree_trans_restart(trans); -+ return ERR_PTR(ret); ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); ++ goto err; ++ } ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto err; + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, @@ -27466,15 +27822,31 @@ index 000000000..965fdfbfa + if (ret) + goto err; + -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags); ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); ++ if (ret && ret != -EINTR) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ bch2_trans_unlock(trans); ++ ++ do { ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); ++ closure_sync(&cl); ++ } while (ret == -EAGAIN); ++ ++ if (ret) { ++ trace_btree_reserve_get_fail(trans->fn, _RET_IP_, ++ nr_nodes[0] + nr_nodes[1]); ++ goto err; ++ } ++ } ++ ++ ret = bch2_trans_relock(trans); + if (ret) + goto err; + -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ ++ bch2_trans_verify_not_restarted(trans, restart_count); + return as; +err: + bch2_btree_update_free(as); @@ -28224,7 +28596,7 @@ index 000000000..965fdfbfa + + bch2_btree_update_done(as); +out: -+ bch2_btree_path_downgrade(iter->path); ++ bch2_btree_path_downgrade(trans, iter->path); + return ret; +} + @@ -28337,7 +28709,7 @@ index 000000000..965fdfbfa + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); + -+ btree_node_unlock(iter2.path, iter2.path->level); ++ btree_node_unlock(trans, iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; + btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); @@ -28411,10 +28783,8 @@ index 000000000..965fdfbfa + int ret = 0; + + if (!btree_node_intent_locked(path, b->c.level) && -+ 
!bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { -+ btree_trans_restart(trans); -+ return -EINTR; -+ } ++ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + + closure_init_stack(&cl); + @@ -28427,8 +28797,9 @@ index 000000000..965fdfbfa + if (ret) { + bch2_trans_unlock(trans); + closure_sync(&cl); -+ if (!bch2_trans_relock(trans)) -+ return -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ return ret; + } + + new_hash = bch2_btree_node_mem_alloc(c, false); @@ -28647,7 +29018,7 @@ index 000000000..965fdfbfa +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000..adfc6c24a +index 000000000000..adfc6c24a7a4 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h @@ -0,0 +1,321 @@ @@ -28974,10 +29345,10 @@ index 000000000..adfc6c24a +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000..aed26b579 +index 000000000000..e2ecbd3bca77 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1815 @@ +@@ -0,0 +1,1800 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -28990,6 +29361,7 @@ index 000000000..aed26b579 +#include "btree_locking.h" +#include "buckets.h" +#include "debug.h" ++#include "errcode.h" +#include "error.h" +#include "extent_update.h" +#include "journal.h" @@ -29262,9 +29634,10 @@ index 000000000..aed26b579 + if (ret) + return ret; + -+ if (!bch2_trans_relock(trans)) { ++ ret = bch2_trans_relock(trans); ++ if (ret) { + trace_trans_restart_journal_preres_get(trans->fn, trace_ip); -+ return -EINTR; ++ return ret; + } + + return 0; @@ -29356,12 +29729,7 @@ index 000000000..aed26b579 + trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, + path->btree_id, &path->pos, + old_u64s, new_u64s); -+ /* -+ * Not using btree_trans_restart() because we can't unlock here, we have -+ * write locks held: -+ */ -+ trans->restarted = true; -+ return -EINTR; ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); +} + +/* Triggers: */ @@ -29553,8 +29921,7 @@ index 000000000..aed26b579 + + if (race_fault()) { + trace_trans_restart_fault_inject(trans->fn, trace_ip); -+ trans->restarted = true; -+ return -EINTR; ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* @@ -29786,6 +30153,7 @@ index 000000000..aed26b579 +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; ++ int ret; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) @@ -29795,10 +30163,11 @@ index 000000000..aed26b579 + if (have_conflicting_read_lock(trans, i->path)) + goto fail; + -+ btree_node_lock_type(trans, i->path, ++ ret = btree_node_lock_type(trans, i->path, + insert_l(i)->b, + i->path->pos, i->level, + SIX_LOCK_write, NULL, NULL); ++ BUG_ON(ret); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); @@ -29814,7 +30183,7 @@ index 000000000..aed26b579 + } + + trace_trans_restart_would_deadlock_write(trans->fn); -+ return btree_trans_restart(trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -29945,10 +30314,7 @@ index 000000000..aed26b579 + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: + ret = 
bch2_btree_split_leaf(trans, i->path, trans->flags); -+ if (!ret) -+ return 0; -+ -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_trans_restart_btree_node_split(trans->fn, trace_ip, + i->btree_id, &i->path->pos); + break; @@ -29959,19 +30325,16 @@ index 000000000..aed26b579 + if (ret) + break; + -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_mark_replicas(trans->fn, trace_ip); -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); + break; + case BTREE_INSERT_NEED_JOURNAL_RES: + bch2_trans_unlock(trans); + + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(trans->flags & JOURNAL_WATERMARK_reserved)) { -+ trans->restarted = true; -+ ret = -EAGAIN; ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + @@ -29979,11 +30342,9 @@ index 000000000..aed26b579 + if (ret) + break; + -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_res_get(trans->fn, trace_ip); -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); + break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); @@ -29995,18 +30356,16 @@ index 000000000..aed26b579 + if (ret < 0) + break; + -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); + break; + default: + BUG_ON(ret >= 0); + break; + } + -+ BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + BUG_ON(ret == -ENOSPC && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL)); @@ -30026,13 +30385,11 @@ index 000000000..aed26b579 + + bch2_trans_unlock(trans); + -+ ret = bch2_fs_read_write_early(c); ++ ret = bch2_fs_read_write_early(c) ?: ++ bch2_trans_relock(trans); + if (ret) + return ret; + -+ if (!bch2_trans_relock(trans)) -+ return -EINTR; -+ + percpu_ref_get(&c->writes); + return 0; +} @@ -30104,7 +30461,7 @@ index 000000000..aed26b579 + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { + trace_trans_restart_upgrade(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + goto out; + } + @@ -30614,8 +30971,7 @@ index 000000000..aed26b579 + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); -+ btree_trans_restart(trans); -+ return -EINTR; ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + iter->key_cache_path->should_be_locked = true; @@ -30743,7 +31099,7 @@ index 000000000..aed26b579 + break; + } + -+ if (ret == -EINTR) { ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + goto retry; + } @@ -30795,10 +31151,10 @@ index 000000000..aed26b579 +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000..1ea7e2baf +index 000000000000..b4be2122c2d5 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2114 @@ +@@ -0,0 +1,2113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -31345,22 +31701,6 @@ index 000000000..1ea7e2baf + } + } + -+ if (new_a.data_type == BCH_DATA_free && -+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) -+ closure_wake_up(&c->freelist_wait); -+ -+ if (new_a.data_type == BCH_DATA_need_discard && -+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) -+ bch2_do_discards(c); -+ -+ if (old_a.data_type != BCH_DATA_cached && -+ new_a.data_type == BCH_DATA_cached && -+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) -+ bch2_do_invalidates(c); -+ -+ if (new_a.data_type == BCH_DATA_need_gc_gens) -+ bch2_do_gc_gens(c); -+ + percpu_down_read(&c->mark_lock); + if (!gc && new_a.gen != old_a.gen) + *bucket_gen(ca, new.k->p.offset) = new_a.gen; @@ -31400,6 +31740,22 @@ index 000000000..1ea7e2baf + } + } + ++ if (new_a.data_type == BCH_DATA_free && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); ++ ++ if (new_a.data_type == BCH_DATA_need_discard && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ bch2_do_discards(c); ++ ++ if (old_a.data_type != BCH_DATA_cached && ++ new_a.data_type == BCH_DATA_cached && ++ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) ++ bch2_do_invalidates(c); ++ ++ if (new_a.data_type == BCH_DATA_need_gc_gens) ++ bch2_do_gc_gens(c); ++ + return 0; +} + @@ -32662,7 +33018,7 @@ index 000000000..1ea7e2baf + enum bch_data_type type, + unsigned sectors) +{ -+ return __bch2_trans_do(trans, NULL, NULL, 0, ++ return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); +} + @@ -32740,8 +33096,7 @@ index 000000000..1ea7e2baf + +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +{ -+ return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, -+ __bch2_trans_mark_dev_sb(&trans, ca)); ++ return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); +} + +/* Disk reservations: */ @@ -32915,7 +33270,7 @@ index 000000000..1ea7e2baf +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000..6881502d9 +index 000000000000..6881502d95f1 --- /dev/null +++ b/fs/bcachefs/buckets.h @@ -0,0 +1,300 @@ @@ -33221,7 +33576,7 @@ index 000000000..6881502d9 +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 -index 000000000..1dbba7d90 +index 000000000000..1dbba7d906dd --- /dev/null +++ b/fs/bcachefs/buckets_types.h @@ -0,0 +1,103 @@ @@ -33330,7 +33685,7 @@ index 000000000..1dbba7d90 +#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c new file mode 100644 -index 000000000..2e5b95508 +index 000000000000..2e5b955080de --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,167 @@ @@ -33503,7 +33858,7 @@ index 000000000..2e5b95508 +} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h new file mode 100644 -index 000000000..d2ae19cbe +index 000000000000..d2ae19cbe18c --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ @@ -33524,7 +33879,7 @@ index 000000000..d2ae19cbe +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h new file mode 100644 -index 000000000..fea7f944d +index 000000000000..fea7f944d0ed --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal_types.h 
@@ -0,0 +1,23 @@ @@ -33553,7 +33908,7 @@ index 000000000..fea7f944d +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 -index 000000000..dbb7e5e0b +index 000000000000..dbb7e5e0b35b --- /dev/null +++ b/fs/bcachefs/chardev.c @@ -0,0 +1,760 @@ @@ -34319,7 +34674,7 @@ index 000000000..dbb7e5e0b +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 -index 000000000..3a4890d39 +index 000000000000..3a4890d39ff9 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ @@ -34356,13 +34711,14 @@ index 000000000..3a4890d39 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000..7c2af6754 +index 000000000000..b5850a761b91 --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,707 @@ +@@ -0,0 +1,712 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" ++#include "errcode.h" +#include "super.h" +#include "super-io.h" + @@ -34889,7 +35245,7 @@ index 000000000..7c2af6754 + + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + @@ -34914,20 +35270,24 @@ index 000000000..7c2af6754 + +static int bch2_alloc_ciphers(struct bch_fs *c) +{ ++ int ret; ++ + if (!c->chacha20) + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ if (IS_ERR(c->chacha20)) { -+ bch_err(c, "error requesting chacha20 module: %li", -+ PTR_ERR(c->chacha20)); -+ return PTR_ERR(c->chacha20); ++ ret = PTR_ERR_OR_ZERO(c->chacha20); ++ ++ if (ret) { ++ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); ++ return ret; + } + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); -+ if (IS_ERR(c->poly1305)) { -+ bch_err(c, "error requesting poly1305 module: %li", -+ PTR_ERR(c->poly1305)); -+ return PTR_ERR(c->poly1305); ++ ret = PTR_ERR_OR_ZERO(c->poly1305); ++ ++ if (ret) { ++ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); ++ return ret; + } + + return 0; @@ -34988,7 +35348,7 @@ index 000000000..7c2af6754 + if (keyed) { + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + @@ -35040,9 +35400,9 @@ index 000000000..7c2af6754 + pr_verbose_init(c->opts, ""); + + c->sha256 = crypto_alloc_shash("sha256", 0, 0); -+ if (IS_ERR(c->sha256)) { -+ bch_err(c, "error requesting sha256 module"); -+ ret = PTR_ERR(c->sha256); ++ ret = PTR_ERR_OR_ZERO(c->sha256); ++ if (ret) { ++ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); + goto out; + } + @@ -35069,7 +35429,7 @@ index 000000000..7c2af6754 +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000..c86c3c05d +index 000000000000..c86c3c05d620 --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,204 @@ @@ -35279,7 +35639,7 @@ index 000000000..c86c3c05d +#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c new file mode 100644 -index 000000000..f3ffdbc38 +index 000000000000..f3ffdbc38485 --- /dev/null +++ b/fs/bcachefs/clock.c @@ -0,0 +1,191 @@ @@ -35476,7 +35836,7 @@ index 000000000..f3ffdbc38 +} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h new file mode 100644 
-index 000000000..70a0f7436 +index 000000000000..70a0f7436c84 --- /dev/null +++ b/fs/bcachefs/clock.h @@ -0,0 +1,38 @@ @@ -35520,7 +35880,7 @@ index 000000000..70a0f7436 +#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h new file mode 100644 -index 000000000..5fae0012d +index 000000000000..5fae0012d808 --- /dev/null +++ b/fs/bcachefs/clock_types.h @@ -0,0 +1,37 @@ @@ -35563,7 +35923,7 @@ index 000000000..5fae0012d +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000..f692f35a6 +index 000000000000..f692f35a6a98 --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,639 @@ @@ -36208,7 +36568,7 @@ index 000000000..f692f35a6 +} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h new file mode 100644 -index 000000000..4bab1f61b +index 000000000000..4bab1f61b3b5 --- /dev/null +++ b/fs/bcachefs/compress.h @@ -0,0 +1,18 @@ @@ -36232,7 +36592,7 @@ index 000000000..4bab1f61b +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c new file mode 100644 -index 000000000..745f856e6 +index 000000000000..745f856e6d3e --- /dev/null +++ b/fs/bcachefs/counters.c @@ -0,0 +1,107 @@ @@ -36345,7 +36705,7 @@ index 000000000..745f856e6 +}; diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h new file mode 100644 -index 000000000..4778aa19b +index 000000000000..4778aa19bf34 --- /dev/null +++ b/fs/bcachefs/counters.h @@ -0,0 +1,17 @@ @@ -36368,7 +36728,7 @@ index 000000000..4778aa19b +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 -index 000000000..519ab9b96 +index 000000000000..519ab9b96e67 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,77 @@ @@ -36451,10 +36811,10 @@ index 000000000..519ab9b96 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000..cc9ae6dad +index 000000000000..3b442b01ca86 --- /dev/null +++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,379 @@ +@@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -36480,13 +36840,13 @@ index 000000000..cc9ae6dad + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; -+ struct snapshots_seen s; ++ snapshot_id_list s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + -+ snapshots_seen_init(&s); ++ darray_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; @@ -36498,7 +36858,6 @@ index 000000000..cc9ae6dad + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { -+next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) @@ -36509,11 +36868,9 @@ index 000000000..cc9ae6dad + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; -+ u32 *i; + -+ darray_for_each(s.ids, i) -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) -+ goto next; ++ if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) ++ continue; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + @@ -36536,13 +36893,13 @@ index 000000000..cc9ae6dad + if (ret) + break; + -+ ret = snapshots_seen_add(c, &s, k.k->p.snapshot); ++ ret = snapshot_list_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); -+ darray_exit(&s.ids); ++ darray_exit(&s); + + return ret; +} @@ -36696,7 +37053,7 @@ index 000000000..cc9ae6dad + bch2_ob_add_backpointer(c, ec_ob, &insert->k); + } +err: 
-+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; @@ -36732,7 +37089,7 @@ index 000000000..cc9ae6dad + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); -+ BUG_ON(ret == -EINTR); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + return ret; +} + @@ -36836,7 +37193,7 @@ index 000000000..cc9ae6dad +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 -index 000000000..e64505453 +index 000000000000..e64505453a55 --- /dev/null +++ b/fs/bcachefs/data_update.h @@ -0,0 +1,38 @@ @@ -36880,10 +37237,10 @@ index 000000000..e64505453 +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000..05cae0ed4 +index 000000000000..cd37a1016e25 --- /dev/null +++ b/fs/bcachefs/debug.c -@@ -0,0 +1,707 @@ +@@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code @@ -37075,6 +37432,7 @@ index 000000000..05cae0ed4 + struct bch_fs *c; + enum btree_id id; + struct bpos from; ++ struct bpos prev_node; + u64 iter; + + struct printbuf buf; @@ -37144,39 +37502,30 @@ index 000000000..05cae0ed4 + i->size = size; + i->ret = 0; + -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ + bch2_trans_init(&trans, i->c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ k = bch2_btree_iter_peek(&iter); -+ -+ while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&i->buf, i->c, k); -+ prt_char(&i->buf, '\n'); -+ -+ k = bch2_btree_iter_next(&iter); -+ i->from = iter.pos; -+ ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ prt_newline(&i->buf); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); + + bch2_trans_exit(&trans); + -+ return err < 0 ? 
err : i->ret; ++ return err ?: i->ret; +} + +static const struct file_operations btree_debug_ops = { @@ -37246,7 +37595,6 @@ index 000000000..05cae0ed4 + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct btree *prev_node = NULL; + int err; + + i->ubuf = buf; @@ -37262,44 +37610,36 @@ index 000000000..05cae0ed4 + + bch2_trans_init(&trans, i->c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ while ((k = bch2_btree_iter_peek(&iter)).k && -+ !(err = bkey_err(k))) { ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = &iter.path->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + -+ if (l->b != prev_node) { -+ bch2_btree_node_to_text(&i->buf, i->c, l->b); -+ err = flush_buf(i); -+ if (err) -+ break; -+ } -+ prev_node = l->b; -+ -+ bch2_bfloat_to_text(&i->buf, l->b, _k); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ bch2_btree_iter_advance(&iter); -+ i->from = iter.pos; -+ + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); ++ i->prev_node = l->b->key.k.p; ++ } ++ ++ bch2_bfloat_to_text(&i->buf, l->b, _k); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); + + bch2_trans_exit(&trans); + -+ return err < 0 ? err : i->ret; ++ return err ?: i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { @@ -37522,6 +37862,75 @@ index 000000000..05cae0ed4 + .read = bch2_journal_pins_read, +}; + ++static int lock_held_stats_open(struct inode *inode, struct file *file) ++{ ++ struct bch_fs *c = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ ++ if (!i) ++ return -ENOMEM; ++ ++ i->iter = 0; ++ i->c = c; ++ i->buf = PRINTBUF; ++ file->private_data = i; ++ ++ return 0; ++} ++ ++static int lock_held_stats_release(struct inode *inode, struct file *file) ++{ ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); ++ ++ return 0; ++} ++ ++static ssize_t lock_held_stats_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct lock_held_stats *lhs = &i->c->lock_held_stats; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ while (lhs->names[i->iter] != 0 && i->iter < BCH_LOCK_TIME_NR) { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ prt_printf(&i->buf, "%s:", lhs->names[i->iter]); ++ prt_newline(&i->buf); ++ printbuf_indent_add(&i->buf, 8); ++ bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]); ++ printbuf_indent_sub(&i->buf, 8); ++ prt_newline(&i->buf); ++ i->iter++; ++ } ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations lock_held_stats_op = { ++ .owner = THIS_MODULE, ++ .open = lock_held_stats_open, ++ .release = lock_held_stats_release, ++ .read = lock_held_stats_read, ++}; ++ +void bch2_fs_debug_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -37550,6 +37959,11 @@ index 000000000..05cae0ed4 + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + ++ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { ++ 
debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir, ++ c, &lock_held_stats_op); ++ } ++ + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; @@ -37593,7 +38007,7 @@ index 000000000..05cae0ed4 +} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h new file mode 100644 -index 000000000..0b86736e5 +index 000000000000..0b86736e5e1b --- /dev/null +++ b/fs/bcachefs/debug.h @@ -0,0 +1,30 @@ @@ -37629,7 +38043,7 @@ index 000000000..0b86736e5 +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000..0cbb765cd +index 000000000000..4d942d224a08 --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,565 @@ @@ -38106,7 +38520,7 @@ index 000000000..0cbb765cd + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); @@ -38191,7 +38605,7 @@ index 000000000..0cbb765cd + } + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -38200,7 +38614,7 @@ index 000000000..0cbb765cd +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000..b1466932c +index 000000000000..b1466932c768 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,67 @@ @@ -38273,7 +38687,7 @@ index 000000000..b1466932c +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000..7bd441367 +index 000000000000..7bd4413671d2 --- /dev/null +++ b/fs/bcachefs/disk_groups.c @@ -0,0 +1,506 @@ @@ -38785,7 +39199,7 @@ index 000000000..7bd441367 +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 -index 000000000..de9154805 +index 000000000000..de915480514b --- /dev/null +++ b/fs/bcachefs/disk_groups.h @@ -0,0 +1,90 @@ @@ -38881,10 +39295,10 @@ index 000000000..de9154805 +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000..6ce352c52 +index 000000000000..f33acf1af110 --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1695 @@ +@@ -0,0 +1,1673 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -39459,18 +39873,14 @@ index 000000000..6ce352c52 + struct btree_iter *iter) +{ + size_t idx = iter->pos.offset; -+ int ret = 0; + + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) -+ return ret; ++ return 0; + + bch2_trans_unlock(trans); -+ ret = -EINTR; + -+ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) -+ return ret; -+ -+ return -ENOMEM; ++ return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: ++ bch2_trans_relock(trans); +} + +static ssize_t stripe_idx_to_delete(struct bch_fs *c) @@ -39613,7 +40023,7 @@ index 000000000..6ce352c52 + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + -+ for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { @@ -39627,12 +40037,13 @@ index 000000000..6ce352c52 + } + + if (bkey_deleted(k.k)) -+ goto found_slot; ++ break; + } + -+ goto err; -+found_slot: -+ start_pos = iter.pos; ++ c->ec_stripe_hint = 
iter.pos.offset; ++ ++ if (ret) ++ goto err; + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) @@ -39641,8 +40052,6 @@ index 000000000..6ce352c52 + stripe->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); -+ -+ c->ec_stripe_hint = start_pos.offset; +err: + bch2_trans_iter_exit(trans, &iter); + @@ -39709,80 +40118,62 @@ index 000000000..6ce352c52 + }; +} + -+static int ec_stripe_update_ptrs(struct bch_fs *c, ++static int ec_stripe_update_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct ec_stripe_buf *s, ++ struct bpos end) ++{ ++ const struct bch_extent_ptr *ptr_c; ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ struct bkey_i *n; ++ int ret, dev, block; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ return 1; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) ++ return 0; ++ ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? ++ */ ++ if (!ptr_c || ptr_c->cached) ++ return 0; ++ ++ dev = s->key.v.ptrs[block].dev; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); ++ BUG_ON(!ec_ptr); ++ ++ extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); ++ ++ return bch2_trans_update(trans, iter, n, 0); ++} ++ ++static int ec_stripe_update_extents(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ -+ struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct bkey_s_extent e; -+ struct bkey_buf sk; -+ struct bpos next_pos; -+ int ret = 0, dev, block; + -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ /* XXX this doesn't support the reflink btree */ -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ bkey_start_pos(pos), -+ BTREE_ITER_INTENT); -+retry: -+ while (bch2_trans_begin(&trans), -+ (k = bch2_btree_iter_peek(&iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { -+ const struct bch_extent_ptr *ptr_c; -+ struct bch_extent_ptr *ptr, *ec_ptr = NULL; -+ -+ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); -+ /* -+ * It doesn't generally make sense to erasure code cached ptrs: -+ * XXX: should we be incrementing a counter? 
-+ */ -+ if (!ptr_c || ptr_c->cached) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ dev = s->key.v.ptrs[block].dev; -+ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ e = bkey_i_to_s_extent(sk.k); -+ -+ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); -+ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); -+ BUG_ON(!ec_ptr); -+ -+ extent_stripe_ptr_add(e, s, ec_ptr, block); -+ -+ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); -+ next_pos = sk.k->k.p; -+ -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, sk.k, 0) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ if (!ret) -+ bch2_btree_iter_set_pos(&iter, next_pos); -+ if (ret) -+ break; -+ } -+ if (ret == -EINTR) -+ goto retry; -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, bkey_start_pos(pos), ++ BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); +} + +/* @@ -39853,9 +40244,10 @@ index 000000000..6ce352c52 + } + + for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); ++ ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); + if (ret) { -+ bch_err(c, "error creating stripe: error %i updating pointers", ret); ++ bch_err(c, "error creating stripe: error updating pointers: %s", ++ bch2_err_str(ret)); + break; + } + } @@ -40582,7 +40974,7 @@ index 000000000..6ce352c52 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000..a4c13d61a +index 000000000000..a4c13d61af10 --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,230 @@ @@ -40818,7 +41210,7 @@ index 000000000..a4c13d61a +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 -index 000000000..edd93da66 +index 000000000000..edd93da663c1 --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,46 @@ @@ -40868,30 +41260,139 @@ index 000000000..edd93da66 +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c +new file mode 100644 +index 000000000000..9da8a5973af0 +--- /dev/null ++++ b/fs/bcachefs/errcode.c +@@ -0,0 +1,51 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "errcode.h" ++ ++#include ++ ++static const char * const bch2_errcode_strs[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, ++ BCH_ERRCODES() ++#undef x ++ NULL ++}; ++ ++#define BCH_ERR_0 0 ++ ++static unsigned bch2_errcode_parents[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, ++ BCH_ERRCODES() ++#undef x ++}; ++ ++const char *bch2_err_str(int err) ++{ ++ const char *errstr; ++ err = abs(err); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ ++ if (err >= BCH_ERR_START) ++ errstr = bch2_errcode_strs[err - BCH_ERR_START]; ++ else if (err) ++ errstr = errname(err); ++ else ++ errstr = "(No error)"; ++ return errstr ?: "(Invalid error)"; ++} ++ ++bool __bch2_err_matches(int err, int class) ++{ ++ err = abs(err); ++ class = abs(class); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ BUG_ON(class >= BCH_ERR_MAX); ++ ++ while (err >= BCH_ERR_START && err != class) ++ err = bch2_errcode_parents[err - BCH_ERR_START]; ++ ++ return err == class; ++} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 
-index 000000000..f7d12915c +index 000000000000..95925c8434b3 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,12 @@ +@@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + -+enum { -+ /* Bucket allocator: */ -+ OPEN_BUCKETS_EMPTY = 2048, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+ INSUFFICIENT_DEVICES, ++#define BCH_ERRCODES() \ ++ x(0, open_buckets_empty) \ ++ x(0, freelist_empty) \ ++ x(freelist_empty, no_buckets_found) \ ++ x(0, insufficient_devices) \ ++ x(0, transaction_restart) \ ++ x(transaction_restart, transaction_restart_fault_inject) \ ++ x(transaction_restart, transaction_restart_relock) \ ++ x(transaction_restart, transaction_restart_relock_path) \ ++ x(transaction_restart, transaction_restart_relock_path_intent) \ ++ x(transaction_restart, transaction_restart_relock_after_fill) \ ++ x(transaction_restart, transaction_restart_too_many_iters) \ ++ x(transaction_restart, transaction_restart_lock_node_reused) \ ++ x(transaction_restart, transaction_restart_fill_relock) \ ++ x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ ++ x(transaction_restart, transaction_restart_mem_realloced) \ ++ x(transaction_restart, transaction_restart_in_traverse_all) \ ++ x(transaction_restart, transaction_restart_would_deadlock) \ ++ x(transaction_restart, transaction_restart_would_deadlock_write)\ ++ x(transaction_restart, transaction_restart_upgrade) \ ++ x(transaction_restart, transaction_restart_key_cache_fill) \ ++ x(transaction_restart, transaction_restart_key_cache_raced) \ ++ x(transaction_restart, transaction_restart_key_cache_realloced)\ ++ x(transaction_restart, transaction_restart_journal_preres_get) \ ++ x(transaction_restart, transaction_restart_nested) \ ++ x(0, lock_fail_node_reused) \ ++ x(0, lock_fail_root_changed) \ ++ x(0, journal_reclaim_would_deadlock) \ ++ x(0, fsck) \ ++ x(fsck, fsck_fix) \ ++ x(fsck, fsck_ignore) \ ++ x(fsck, fsck_errors_not_fixed) \ ++ x(fsck, fsck_repair_unimplemented) \ ++ x(fsck, fsck_repair_impossible) \ ++ x(0, need_snapshot_cleanup) \ ++ x(0, need_topology_repair) ++ ++enum bch_errcode { ++ BCH_ERR_START = 2048, ++#define x(class, err) BCH_ERR_##err, ++ BCH_ERRCODES() ++#undef x ++ BCH_ERR_MAX +}; + ++const char *bch2_err_str(int); ++bool __bch2_err_matches(int, int); ++ ++static inline bool _bch2_err_matches(int err, int class) ++{ ++ return err && __bch2_err_matches(err, class); ++} ++ ++#define bch2_err_matches(_err, _class) \ ++({ \ ++ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ ++ _bch2_err_matches(_err, _class); \ ++}) ++ +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000..8279a9ba7 +index 000000000000..f6a895b2ceb7 --- /dev/null +++ b/fs/bcachefs/error.c -@@ -0,0 +1,185 @@ +@@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" @@ -40962,8 +41463,7 @@ index 000000000..8279a9ba7 +#include "tools-util.h" +#endif + -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, -+ const char *fmt, ...) ++int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
+{ + struct fsck_err_state *s = NULL; + va_list args; @@ -40977,10 +41477,10 @@ index 000000000..8279a9ba7 + + if (c->opts.errors == BCH_ON_ERROR_continue) { + bch_err(c, "fixing"); -+ return FSCK_ERR_FIX; ++ return -BCH_ERR_fsck_fix; + } else { + bch2_inconsistent_error(c); -+ return FSCK_ERR_EXIT; ++ return -BCH_ERR_fsck_errors_not_fixed; + } + } + @@ -41050,14 +41550,14 @@ index 000000000..8279a9ba7 + + if (fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ return FSCK_ERR_FIX; ++ return -BCH_ERR_fsck_fix; + } else { + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_ERROR, &c->flags); + return c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE) -+ ? FSCK_ERR_EXIT -+ : FSCK_ERR_IGNORE; ++ ? -BCH_ERR_fsck_errors_not_fixed ++ : -BCH_ERR_fsck_ignore; + } +} + @@ -41079,10 +41579,10 @@ index 000000000..8279a9ba7 +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 -index 000000000..6e63c3818 +index 000000000000..b603d738c549 --- /dev/null +++ b/fs/bcachefs/error.h -@@ -0,0 +1,238 @@ +@@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H @@ -41176,14 +41676,6 @@ index 000000000..6e63c3818 + * be able to repair: + */ + -+enum { -+ BCH_FSCK_OK = 0, -+ BCH_FSCK_ERRORS_NOT_FIXED = 1, -+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, -+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, -+ BCH_FSCK_UNKNOWN_VERSION = 4, -+}; -+ +enum fsck_err_opts { + FSCK_OPT_EXIT, + FSCK_OPT_YES, @@ -41191,13 +41683,6 @@ index 000000000..6e63c3818 + FSCK_OPT_ASK, +}; + -+enum fsck_err_ret { -+ FSCK_ERR_IGNORE = 0, -+ FSCK_ERR_FIX = 1, -+ FSCK_ERR_EXIT = 2, -+ FSCK_ERR_START_TOPOLOGY_REPAIR = 3, -+}; -+ +struct fsck_err_state { + struct list_head list; + const char *fmt; @@ -41212,21 +41697,21 @@ index 000000000..6e63c3818 +#define FSCK_NO_RATELIMIT (1 << 3) + +__printf(3, 4) __cold -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *, -+ unsigned, const char *, ...); ++int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); +void bch2_flush_fsck_errs(struct bch_fs *); + +#define __fsck_err(c, _flags, msg, ...) 
\ +({ \ -+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ ++ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ + \ -+ if (_fix == FSCK_ERR_EXIT) { \ ++ if (_ret != -BCH_ERR_fsck_fix && \ ++ _ret != -BCH_ERR_fsck_ignore) { \ + bch_err(c, "Unable to continue, halting"); \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = _ret; \ + goto fsck_err; \ + } \ + \ -+ _fix; \ ++ _ret == -BCH_ERR_fsck_fix; \ +}) + +/* These macros return true if error should be fixed: */ @@ -41323,7 +41808,7 @@ index 000000000..6e63c3818 +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 -index 000000000..2fd5d9672 +index 000000000000..2fd5d9672a44 --- /dev/null +++ b/fs/bcachefs/extent_update.c @@ -0,0 +1,178 @@ @@ -41507,7 +41992,7 @@ index 000000000..2fd5d9672 +} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h new file mode 100644 -index 000000000..6f5cf4493 +index 000000000000..6f5cf449361a --- /dev/null +++ b/fs/bcachefs/extent_update.h @@ -0,0 +1,12 @@ @@ -41525,7 +42010,7 @@ index 000000000..6f5cf4493 +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000..2ca13014b +index 000000000000..2ca13014b9c4 --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1324 @@ @@ -42855,7 +43340,7 @@ index 000000000..2ca13014b +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000..3c17b8113 +index 000000000000..3c17b81130bb --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,685 @@ @@ -43546,7 +44031,7 @@ index 000000000..3c17b8113 +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h new file mode 100644 -index 000000000..43d6c341e +index 000000000000..43d6c341ecca --- /dev/null +++ b/fs/bcachefs/extents_types.h @@ -0,0 +1,40 @@ @@ -43592,7 +44077,7 @@ index 000000000..43d6c341e +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 -index 000000000..05429c963 +index 000000000000..05429c9631cd --- /dev/null +++ b/fs/bcachefs/eytzinger.h @@ -0,0 +1,281 @@ @@ -43879,7 +44364,7 @@ index 000000000..05429c963 +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 -index 000000000..cdb272708 +index 000000000000..cdb272708a4b --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,127 @@ @@ -44012,7 +44497,7 @@ index 000000000..cdb272708 +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000..53ffc6842 +index 000000000000..53ffc684223c --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,496 @@ @@ -44514,7 +44999,7 @@ index 000000000..53ffc6842 +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 -index 000000000..dde237859 +index 000000000000..dde237859514 --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,43 @@ @@ -44563,7 +45048,7 @@ index 000000000..dde237859 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000..bcfd9e5f3 +index 000000000000..f37bc43e27f4 --- /dev/null +++ b/fs/bcachefs/fs-io.c @@ -0,0 +1,3496 @@ @@ -44978,7 +45463,7 @@ index 000000000..bcfd9e5f3 + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + 
bch2_trans_exit(&trans); + @@ -45614,10 +46099,9 @@ index 000000000..bcfd9e5f3 + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) + break; -+ } + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -45670,7 +46154,7 @@ index 000000000..bcfd9e5f3 +err: + bch2_trans_iter_exit(trans, &iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { @@ -46642,7 +47126,7 @@ index 000000000..bcfd9e5f3 + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (err == -EINTR) ++ if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + @@ -47018,7 +47502,7 @@ index 000000000..bcfd9e5f3 + start = iter.pos; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -47408,7 +47892,8 @@ index 000000000..bcfd9e5f3 + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); + -+ while (ret == 0 || ret == -EINTR) { ++ while (ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; @@ -47610,7 +48095,7 @@ index 000000000..bcfd9e5f3 +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + bch2_disk_reservation_put(c, &disk_res); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + @@ -47890,7 +48375,7 @@ index 000000000..bcfd9e5f3 + } + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -48005,7 +48490,7 @@ index 000000000..bcfd9e5f3 + } + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -48065,7 +48550,7 @@ index 000000000..bcfd9e5f3 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 -index 000000000..7f2d7f454 +index 000000000000..7f2d7f454be4 --- /dev/null +++ b/fs/bcachefs/fs-io.h @@ -0,0 +1,56 @@ @@ -48127,7 +48612,7 @@ index 000000000..7f2d7f454 +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000..9f329a624 +index 000000000000..9f329a624c12 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c @@ -0,0 +1,523 @@ @@ -48656,7 +49141,7 @@ index 000000000..9f329a624 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h new file mode 100644 -index 000000000..f201980ef +index 000000000000..f201980ef2c3 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.h @@ -0,0 +1,81 @@ @@ -48743,7 +49228,7 @@ index 000000000..f201980ef +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000..bb94ba58a +index 000000000000..6d57bd87bfd5 --- /dev/null +++ b/fs/bcachefs/fs.c @@ -0,0 +1,1939 @@ @@ -48757,6 +49242,7 @@ index 000000000..bb94ba58a +#include "buckets.h" +#include "chardev.h" +#include "dirent.h" ++#include "errcode.h" +#include "extents.h" +#include "fs.h" +#include "fs-common.h" @@ -48902,7 +49388,7 @@ index 000000000..bb94ba58a + + bch2_trans_iter_exit(&trans, 
&iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -49072,7 +49558,7 @@ index 000000000..bb94ba58a + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + goto err_trans; + } @@ -49192,7 +49678,7 @@ index 000000000..bb94ba58a + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_link_trans(&trans, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, @@ -49241,7 +49727,7 @@ index 000000000..bb94ba58a + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_init(&trans, c, 4, 1024); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, @@ -49363,7 +49849,7 @@ index 000000000..bb94ba58a + goto err; + } + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_rename_trans(&trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, @@ -49503,7 +49989,7 @@ index 000000000..bb94ba58a +btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err_trans; @@ -49734,7 +50220,7 @@ index 000000000..bb94ba58a + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (!ret && have_extent) @@ -50084,7 +50570,7 @@ index 000000000..bb94ba58a + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); @@ -50619,10 +51105,9 @@ index 000000000..bb94ba58a + sb->s_shrink.seeks = 0; + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); -+ if (IS_ERR(vinode)) { -+ bch_err(c, "error mounting: error getting root inode %i", -+ (int) PTR_ERR(vinode)); -+ ret = PTR_ERR(vinode); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) { ++ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); + goto err_put_super; + } + @@ -50688,7 +51173,7 @@ index 000000000..bb94ba58a +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000..9f4b57e30 +index 000000000000..9f4b57e30e2a --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,208 @@ @@ -50902,10 +51387,10 @@ index 000000000..9f4b57e30 +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000..81bfd6ea2 +index 000000000000..bb8cab7cb405 --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2413 @@ +@@ -0,0 +1,2390 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -51044,9 +51529,9 @@ index 000000000..81bfd6ea2 + + ret = bch2_inode_unpack(k, inode); +err: -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "error %i fetching inode %llu", -+ ret, inode_nr); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu: %s", ++ inode_nr, bch2_err_str(ret)); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -51072,9 +51557,9 @@ index 000000000..81bfd6ea2 + if (!ret) + 
*snapshot = iter.pos.snapshot; +err: -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "error %i fetching inode %llu:%u", -+ ret, inode_nr, *snapshot); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu:%u: %s", ++ inode_nr, *snapshot, bch2_err_str(ret)); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -51128,17 +51613,19 @@ index 000000000..81bfd6ea2 + struct bch_inode_unpacked *inode, + u32 snapshot) +{ -+ int ret = __bch2_trans_do(trans, NULL, NULL, ++ int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __write_inode(trans, inode, snapshot)); + if (ret) -+ bch_err(trans->c, "error in fsck: error %i updating inode", ret); ++ bch_err(trans->c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); + return ret; +} + +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ ++ struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; @@ -51171,7 +51658,7 @@ index 000000000..81bfd6ea2 + goto err; + + if (!bkey_is_inode(k.k)) { -+ bch2_fs_inconsistent(trans->c, ++ bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; @@ -51181,11 +51668,8 @@ index 000000000..81bfd6ea2 + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ -+ if (inode_u.bi_subvol) { -+ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); -+ if (ret) -+ goto err; -+ } ++ if (inode_u.bi_subvol) ++ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; @@ -51196,7 +51680,7 @@ index 000000000..81bfd6ea2 + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + return ret; @@ -51223,8 +51707,8 @@ index 000000000..81bfd6ea2 + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); +err: -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from __remove_dirent()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); + return ret; +} + @@ -51259,8 +51743,8 @@ index 000000000..81bfd6ea2 + goto create_lostfound; + } + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error looking up lost+found: %i", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); + if (ret) + return ret; + @@ -51282,8 +51766,8 @@ index 000000000..81bfd6ea2 + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); -+ if (ret && ret != -EINTR) -+ bch_err(c, "error creating lost+found: %i", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); + return ret; +} + @@ -51342,13 +51826,13 @@ index 000000000..81bfd6ea2 + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ -+ int ret = __bch2_trans_do(trans, NULL, NULL, ++ int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); + if (ret) { -+ bch_err(trans->c, "error %i reattaching inode %llu", -+ ret, inode->bi_inum); ++ bch_err(trans->c, "error reattaching inode %llu: %s", ++ inode->bi_inum, bch2_err_str(ret)); + return ret; + } 
+ @@ -51379,19 +51863,82 @@ index 000000000..81bfd6ea2 + return ret; +} + -+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) ++struct snapshots_seen_entry { ++ u32 id; ++ u32 equiv; ++}; ++ ++struct snapshots_seen { ++ struct bpos pos; ++ DARRAY(struct snapshots_seen_entry) ids; ++}; ++ ++static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ -+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; ++ darray_exit(&s->ids); ++} ++ ++static inline void snapshots_seen_init(struct snapshots_seen *s) ++{ ++ memset(s, 0, sizeof(*s)); ++} ++ ++static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ struct snapshots_seen_entry *i, n = { id, id }; ++ int ret; ++ ++ darray_for_each(s->ids, i) { ++ if (n.equiv < i->equiv) ++ break; ++ ++ if (i->equiv == n.equiv) { ++ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); ++ return -EINVAL; ++ } ++ } ++ ++ ret = darray_insert_item(&s->ids, i - s->ids.data, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ ++static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct snapshots_seen_entry *i, n = { ++ .id = pos.snapshot, ++ .equiv = bch2_snapshot_equiv(c, pos.snapshot), ++ }; ++ int ret; + + if (bkey_cmp(s->pos, pos)) + s->ids.nr = 0; ++ ++ pos.snapshot = n.equiv; + s->pos = pos; + -+ /* Might get called multiple times due to lock restarts */ -+ if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) -+ return 0; ++ darray_for_each(s->ids, i) ++ if (i->equiv == n.equiv) { ++ if (i->id != n.id) { ++ bch_err(c, "snapshot deletion did not run correctly:\n" ++ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", ++ bch2_btree_ids[btree_id], ++ pos.inode, pos.offset, ++ i->id, n.id, n.equiv); ++ return -BCH_ERR_need_snapshot_cleanup; ++ } + -+ return snapshots_seen_add(c, s, pos.snapshot); ++ return 0; ++ } ++ ++ ret = darray_push(&s->ids, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; +} + +/** @@ -51404,15 +51951,15 @@ index 000000000..81bfd6ea2 + u32 id, u32 ancestor) +{ + ssize_t i; ++ u32 top = seen->ids.nr ? 
seen->ids.data[seen->ids.nr - 1].equiv : 0; + + BUG_ON(id > ancestor); -+ -+ id = snapshot_t(c, id)->equiv; -+ ancestor = snapshot_t(c, ancestor)->equiv; ++ BUG_ON(!bch2_snapshot_is_equiv(c, id)); ++ BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); + + /* @ancestor should be the snapshot most recently added to @seen */ -+ BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); -+ BUG_ON(seen->pos.snapshot != ancestor); ++ BUG_ON(ancestor != seen->pos.snapshot); ++ BUG_ON(ancestor != top); + + if (id == ancestor) + return true; @@ -51421,10 +51968,10 @@ index 000000000..81bfd6ea2 + return false; + + for (i = seen->ids.nr - 2; -+ i >= 0 && seen->ids.data[i] >= id; ++ i >= 0 && seen->ids.data[i].equiv >= id; + --i) -+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && -+ bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) + return false; + + return true; @@ -51449,8 +51996,9 @@ index 000000000..81bfd6ea2 + : bch2_snapshot_is_ancestor(c, src, dst); +} + -+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ -+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ ++#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ ++ (_i)->snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + +struct inode_walker_entry { @@ -51485,7 +52033,7 @@ index 000000000..81bfd6ea2 + + return darray_push(&w->inodes, ((struct inode_walker_entry) { + .inode = u, -+ .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, ++ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + })); +} + @@ -51495,10 +52043,11 @@ index 000000000..81bfd6ea2 + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; -+ unsigned i, ancestor_pos; ++ u32 restart_count = trans->restart_count; ++ unsigned i; + int ret; + -+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; ++ pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); + + if (pos.inode == w->cur_inum) { + w->first_this_inode = false; @@ -51522,6 +52071,10 @@ index 000000000..81bfd6ea2 + + w->cur_inum = pos.inode; + w->first_this_inode = true; ++ ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ +lookup_snapshot: + for (i = 0; i < w->inodes.nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) @@ -51531,17 +52084,20 @@ index 000000000..81bfd6ea2 + BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + + if (pos.snapshot != w->inodes.data[i].snapshot) { -+ ancestor_pos = i; ++ struct inode_walker_entry e = w->inodes.data[i]; ++ ++ e.snapshot = pos.snapshot; ++ e.count = 0; ++ ++ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", ++ pos.inode, pos.snapshot, w->inodes.data[i].snapshot); + + while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + --i; + -+ ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]); ++ ret = darray_insert_item(&w->inodes, i, e); + if (ret) + return ret; -+ -+ w->inodes.data[i].snapshot = pos.snapshot; -+ w->inodes.data[i].count = 0; + } + + return i; @@ -51561,17 +52117,19 @@ index 000000000..81bfd6ea2 + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); 
++ + if (k.k->p.offset != inum) + break; + -+ if (!bkey_is_inode(k.k)) ++ if (!ref_visible(c, s, s->pos.snapshot, equiv)) + continue; + -+ if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { ++ if (bkey_is_inode(k.k)) + add_inode(c, w, k); -+ if (k.k->p.snapshot >= s->pos.snapshot) -+ break; -+ } ++ ++ if (equiv >= s->pos.snapshot) ++ break; + } + bch2_trans_iter_exit(trans, &iter); + @@ -51586,7 +52144,7 @@ index 000000000..81bfd6ea2 + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, ++ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, @@ -51678,44 +52236,40 @@ index 000000000..81bfd6ea2 + "hashed to %llu\n%s", + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) -+ return 0; -+ -+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); -+ if (ret) { -+ bch_err(c, "hash_redo_key err %i", ret); -+ return ret; ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ++ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ret = -BCH_ERR_transaction_restart_nested; + } -+ ret = -EINTR; +fsck_err: + goto out; +} + +static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bkey_s_c k, + struct bch_inode_unpacked *prev, ++ struct snapshots_seen *s, + bool full) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + struct bch_inode_unpacked u; + bool do_update = false; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + ret = check_key_has_snapshot(trans, iter, k); + if (ret < 0) + goto err; + if (ret) + return 0; + ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes @@ -51754,7 +52308,8 @@ index 000000000..81bfd6ea2 + + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); + if (ret) -+ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ bch_err(c, "error in fsck: error while deleting inode: %s", ++ bch2_err_str(ret)); + return ret; + } + @@ -51777,7 +52332,8 @@ index 000000000..81bfd6ea2 + POS(u.bi_inum, U64_MAX), + 0, NULL); + if (ret) { -+ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ bch_err(c, "error in fsck: error truncating inode: %s", ++ bch2_err_str(ret)); + return ret; + } + @@ -51802,8 +52358,8 @@ index 000000000..81bfd6ea2 + + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); + if (sectors < 0) { -+ bch_err(c, "error in fsck: error %i recounting inode sectors", -+ (int) sectors); ++ bch_err(c, "error in fsck: error recounting inode sectors: %s", ++ bch2_err_str(sectors)); + return sectors; + } + @@ -51820,15 +52376,15 @@ index 000000000..81bfd6ea2 + } + + if (do_update) { -+ ret = write_inode(trans, &u, iter->pos.snapshot); ++ ret = __write_inode(trans, &u, iter->pos.snapshot); + if (ret) -+ bch_err(c, "error in fsck: error %i " -+ "updating inode", ret); ++ bch_err(c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); + } +err: +fsck_err: + if (ret) -+ bch_err(c, "error %i from 
check_inode()", ret); ++ bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); + return ret; +} + @@ -51838,90 +52394,66 @@ index 000000000..81bfd6ea2 + struct btree_trans trans; + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; ++ struct snapshots_seen s; ++ struct bkey_s_c k; + int ret; + ++ snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_inode(&trans, &iter, &prev, full)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_inode(&trans, &iter, k, &prev, &s, full)); + + bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); + if (ret) -+ bch_err(c, "error %i from check_inodes()", ret); ++ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); + return ret; +} + -+static int check_subvol(struct btree_trans *trans, -+ struct btree_iter *iter) ++/* ++ * Checking for overlapping extents needs to be reimplemented ++ */ ++#if 0 ++static int fix_overlapping_extent(struct btree_trans *trans, ++ struct bkey_s_c k, struct bpos cut_at) +{ -+ struct bkey_s_c k; -+ struct bkey_s_c_subvolume subvol; ++ struct btree_iter iter; ++ struct bkey_i *u; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + -+ if (k.k->type != KEY_TYPE_subvolume) -+ return 0; ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); + -+ subvol = bkey_s_c_to_subvolume(k); + -+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { -+ ret = bch2_subvolume_delete(trans, iter->pos.offset); -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "error deleting subvolume %llu: %i", -+ iter->pos.offset, ret); -+ if (ret) -+ return ret; -+ } ++ /* ++ * We don't want to go through the extent_handle_overwrites path: ++ * ++ * XXX: this is going to screw up disk accounting, extent triggers ++ * assume things about extent overwrites - we should be running the ++ * triggers manually here ++ */ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + -+ return 0; -+} -+ -+noinline_for_stack -+static int check_subvols(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, -+ POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_subvol(&trans, &iter)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); ++ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ bch2_trans_iter_exit(trans, &iter); + return ret; +} ++#endif + +static struct bkey_s_c_dirent 
dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, @@ -51981,15 +52513,15 @@ index 000000000..81bfd6ea2 +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; -+ int ret = 0, ret2 = 0; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + -+ count2 = lockrestart_do(trans, -+ bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); ++ count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", @@ -52002,101 +52534,33 @@ index 000000000..81bfd6ea2 + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, -+ i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) -+ continue; -+ -+ i->inode.bi_sectors = i->count; -+ ret = write_inode(trans, &i->inode, i->snapshot); -+ if (ret) -+ break; -+ ret2 = -EINTR; -+ } -+fsck_err: -+ if (ret) -+ bch_err(c, "error %i from check_i_sectors()", ret); -+ return ret ?: ret2; -+} -+ -+struct extent_end { -+ u32 snapshot; -+ u64 offset; -+}; -+ -+typedef DARRAY(struct extent_end) extent_ends; -+ -+static int extent_ends_at(extent_ends *extent_ends, -+ struct bkey_s_c k) -+{ -+ struct extent_end *i, n = (struct extent_end) { -+ .snapshot = k.k->p.snapshot, -+ .offset = k.k->p.offset, -+ }; -+ -+ darray_for_each(*extent_ends, i) { -+ if (i->snapshot == k.k->p.snapshot) { -+ *i = n; -+ return 0; -+ } -+ -+ if (i->snapshot >= k.k->p.snapshot) -+ break; -+ } -+ -+ return darray_insert_item(extent_ends, i - extent_ends->data, n); -+} -+ -+static int check_extent_start(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ extent_ends *extent_ends, -+ struct bkey_s_c k, -+ struct btree_iter *iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct extent_end *i; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ darray_for_each(*extent_ends, i) { -+ if (fsck_err_on(i->offset > bkey_start_offset(k.k) && -+ key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot), c, -+ "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s", -+ i->snapshot, -+ i->offset, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ bkey_reassemble(update, k); -+ ret = bch2_trans_update_extent(trans, iter, update, 0); -+ if (!ret) -+ goto err; ++ i->inode.bi_sectors, i->count)) { ++ i->inode.bi_sectors = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; + } + } -+err: +fsck_err: -+ printbuf_exit(&buf); -+ return ret; ++ if (ret) { ++ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, + struct inode_walker *inode, -+ struct snapshots_seen *s, -+ extent_ends *extent_ends) ++ struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; ++ struct bpos equiv; + int ret = 0; -+peek: -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ goto out; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; + + ret = 
check_key_has_snapshot(trans, iter, k); + if (ret) { @@ -52104,7 +52568,10 @@ index 000000000..81bfd6ea2 + goto out; + } + -+ ret = snapshots_seen_update(c, s, k.k->p); ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + @@ -52112,23 +52579,28 @@ index 000000000..81bfd6ea2 + goto out; + + if (inode->cur_inum != k.k->p.inode) { -+ extent_ends->nr = 0; -+ + ret = check_i_sectors(trans, inode); + if (ret) + goto err; + } + -+ if (!iter->path->should_be_locked) { -+ /* -+ * hack: check_i_sectors may have handled a transaction restart, -+ * it shouldn't be but we need to fix the new i_sectors check -+ * code and delete the old bch2_count_inode_sectors() first -+ */ -+ goto peek; -+ } ++ BUG_ON(!iter->path->should_be_locked); ++#if 0 ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; + -+ ret = __walk_inode(trans, inode, k.k->p); ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ++ ?: -BCH_ERR_transaction_restart_nested; ++ goto out; ++ } ++ } ++#endif ++ ret = __walk_inode(trans, inode, equiv); + if (ret < 0) + goto err; + @@ -52160,42 +52632,56 @@ index 000000000..81bfd6ea2 + goto out; + } + -+ if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { -+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { -+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ k.k->type != KEY_TYPE_reservation && -+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, -+ "extent type %u offset %llu past end of inode %llu, i_size %llu", -+ k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { -+ bch2_fs_lazy_rw(c); -+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, -+ SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, -+ k.k->p.snapshot), -+ POS(k.k->p.inode, U64_MAX), -+ 0, NULL) ?: -EINTR; -+ goto out; ++ /* ++ * Check inodes in reverse order, from oldest snapshots to newest, so ++ * that we emit the fewest number of whiteouts necessary: ++ */ ++ for (i = inode->inodes.data + inode->inodes.nr - 1; ++ i >= inode->inodes.data; ++ --i) { ++ if (i->snapshot > equiv.snapshot || ++ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) ++ continue; ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type past end of inode %llu:%u, i_size %llu\n %s", ++ i->inode.bi_inum, i->snapshot, i->inode.bi_size, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ struct btree_iter iter2; ++ ++ bch2_trans_copy_iter(&iter2, iter); ++ bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_btree_delete_at(trans, &iter2, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &iter2); ++ if (ret) ++ goto err; ++ ++ if (i->snapshot != equiv.snapshot) { ++ ret = snapshots_seen_add(c, s, i->snapshot); ++ if (ret) ++ goto err; + } + } + } + -+ ret = check_extent_start(trans, s, extent_ends, k, iter); -+ if (ret) -+ goto err; -+ + if (bkey_extent_is_allocation(k.k)) -+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) ++ for_each_visible_inode(c, s, inode, equiv.snapshot, i) + i->count += k.k->size; ++#if 0 ++ 
bch2_bkey_buf_reassemble(&prev, c, k); ++#endif + -+ ret = extent_ends_at(extent_ends, k); -+ if (ret) -+ goto err; +out: +err: +fsck_err: + printbuf_exit(&buf); + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_extent()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52210,36 +52696,34 @@ index 000000000..81bfd6ea2 + struct snapshots_seen s; + struct btree_trans trans; + struct btree_iter iter; -+ extent_ends extent_ends = { 0 }; ++ struct bkey_s_c k; + int ret = 0; + ++#if 0 ++ struct bkey_buf prev; ++ bch2_bkey_buf_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++#endif + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch_verbose(c, "checking extents"); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_extent(&trans, &iter, &w, &s, &extent_ends)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); -+ darray_exit(&extent_ends); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_extent(&trans, &iter, k, &w, &s)); ++#if 0 ++ bch2_bkey_buf_exit(&prev, c); ++#endif + inode_walker_exit(&w); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + + if (ret) -+ bch_err(c, "error %i from check_extents()", ret); ++ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52247,7 +52731,8 @@ index 000000000..81bfd6ea2 +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; -+ int ret = 0, ret2 = 0; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { @@ -52273,13 +52758,16 @@ index 000000000..81bfd6ea2 + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; -+ ret2 = -EINTR; + } + } +fsck_err: -+ if (ret) -+ bch_err(c, "error %i from check_subdir_count()", ret); -+ return ret ?: ret2; ++ if (ret) { ++ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; +} + +static int check_dirent_target(struct btree_trans *trans, @@ -52396,31 +52884,24 @@ index 000000000..81bfd6ea2 +fsck_err: + printbuf_exit(&buf); + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_target()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); + return ret; +} + +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; ++ struct bpos equiv; + int ret = 0; -+peek: -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ goto out; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { @@ -52428,7 +52909,10 @@ index 
000000000..81bfd6ea2 + goto out; + } + -+ ret = snapshots_seen_update(c, s, k.k->p); ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + @@ -52441,12 +52925,9 @@ index 000000000..81bfd6ea2 + goto err; + } + -+ if (!iter->path->should_be_locked) { -+ /* hack: see check_extent() */ -+ goto peek; -+ } ++ BUG_ON(!iter->path->should_be_locked); + -+ ret = __walk_inode(trans, dir, k.k->p); ++ ret = __walk_inode(trans, dir, equiv); + if (ret < 0) + goto err; + @@ -52546,7 +53027,8 @@ index 000000000..81bfd6ea2 + goto err; + + if (fsck_err_on(!target->inodes.nr, c, -+ "dirent points to missing inode:\n%s", ++ "dirent points to missing inode: (equiv %u)\n%s", ++ equiv.snapshot, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { @@ -52564,7 +53046,7 @@ index 000000000..81bfd6ea2 + } + + if (d.v->d_type == DT_DIR) -+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) ++ for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + +out: @@ -52572,8 +53054,8 @@ index 000000000..81bfd6ea2 +fsck_err: + printbuf_exit(&buf); + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_dirent()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52590,6 +53072,7 @@ index 000000000..81bfd6ea2 + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret = 0; + + bch_verbose(c, "checking dirents"); @@ -52597,22 +53080,13 @@ index 000000000..81bfd6ea2 + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_dirent(&trans, &iter, &hash_info, -+ &dir, &target, &s)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); + + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); @@ -52620,26 +53094,18 @@ index 000000000..81bfd6ea2 + inode_walker_exit(&target); + + if (ret) -+ bch_err(c, "error %i from check_dirents()", ret); ++ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); + return ret; +} + +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; @@ -52663,8 +53129,8 @@ index 000000000..81bfd6ea2 + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_xattr()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); + return 
ret; +} + @@ -52678,33 +53144,25 @@ index 000000000..81bfd6ea2 + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret = 0; + + bch_verbose(c, "checking xattrs"); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_xattr(&trans, &iter, &hash_info, -+ &inode)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_xattr(&trans, &iter, k, &hash_info, &inode)); + + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error %i from check_xattrs()", ret); ++ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52731,12 +53189,12 @@ index 000000000..81bfd6ea2 + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); -+ ret = __bch2_trans_do(trans, NULL, NULL, ++ ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { -+ bch_err(c, "error writing root subvol: %i", ret); ++ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); + goto err; + } + @@ -52755,7 +53213,7 @@ index 000000000..81bfd6ea2 + + ret = __write_inode(trans, &root_inode, snapshot); + if (ret) -+ bch_err(c, "error writing root inode: %i", ret); ++ bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); + } +err: +fsck_err: @@ -52820,7 +53278,7 @@ index 000000000..81bfd6ea2 + struct bch_fs *c = trans->c; + int ret = 0; + -+ snapshot = snapshot_t(c, snapshot)->equiv; ++ snapshot = bch2_snapshot_equiv(c, snapshot); + p->nr = 0; + + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && @@ -52894,7 +53352,7 @@ index 000000000..81bfd6ea2 + if (!fsck_err(c, "directory structure loop")) + return 0; + -+ ret = __bch2_trans_do(trans, NULL, NULL, ++ ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); @@ -52908,7 +53366,7 @@ index 000000000..81bfd6ea2 + } +fsck_err: + if (ret) -+ bch_err(c, "%s: err %i", __func__, ret); ++ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -52952,8 +53410,6 @@ index 000000000..81bfd6ea2 + } + bch2_trans_iter_exit(&trans, &iter); + -+ BUG_ON(ret == -EINTR); -+ + darray_exit(&path); + + bch2_trans_exit(&trans); @@ -53105,7 +53561,7 @@ index 000000000..81bfd6ea2 + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ ret = snapshots_seen_update(c, &s, k.k->p); ++ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; + @@ -53117,7 +53573,7 @@ index 000000000..81bfd6ea2 + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), -+ d.k->p.snapshot); ++ bch2_snapshot_equiv(c, d.k->p.snapshot)); + break; + } + } @@ -53131,6 +53587,47 @@ index 000000000..81bfd6ea2 + return ret; +} + ++static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct nlink_table 
*links, ++ size_t *idx, u64 range_end) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct nlink *link = &links->d[*idx]; ++ int ret = 0; ++ ++ if (k.k->p.offset >= range_end) ++ return 1; ++ ++ if (!bkey_is_inode(k.k)) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ return 0; ++ ++ if (!u.bi_nlink) ++ return 0; ++ ++ while ((cmp_int(link->inum, k.k->p.offset) ?: ++ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { ++ BUG_ON(*idx == links->nr); ++ link = &links->d[++*idx]; ++ } ++ ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, ++ "inode %llu type %s has wrong i_nlink (%u, should be %u)", ++ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], ++ bch2_inode_nlink_get(&u), link->count)) { ++ bch2_inode_nlink_set(&u, link->count); ++ ret = __write_inode(trans, &u, k.k->p.snapshot); ++ } ++fsck_err: ++ return ret; ++} ++ +noinline_for_stack +static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct nlink_table *links, @@ -53139,56 +53636,25 @@ index 000000000..81bfd6ea2 + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct bch_inode_unpacked u; -+ struct nlink *link = links->d; ++ size_t idx = 0; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, -+ POS(0, range_start), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (k.k->p.offset >= range_end) -+ break; ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS(0, range_start), ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); + -+ if (!bkey_is_inode(k.k)) -+ continue; -+ -+ BUG_ON(bch2_inode_unpack(k, &u)); -+ -+ if (S_ISDIR(le16_to_cpu(u.bi_mode))) -+ continue; -+ -+ if (!u.bi_nlink) -+ continue; -+ -+ while ((cmp_int(link->inum, k.k->p.offset) ?: -+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { -+ link++; -+ BUG_ON(link >= links->d + links->nr); -+ } -+ -+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, -+ "inode %llu type %s has wrong i_nlink (%u, should be %u)", -+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], -+ bch2_inode_nlink_get(&u), link->count)) { -+ bch2_inode_nlink_set(&u, link->count); -+ -+ ret = write_inode(&trans, &u, k.k->p.snapshot); -+ if (ret) -+ bch_err(c, "error in fsck: error %i updating inode", ret); -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + -+ if (ret) ++ if (ret < 0) { + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); ++ return ret; ++ } + -+ return ret; ++ return 0; +} + +noinline_for_stack @@ -53228,21 +53694,13 @@ index 000000000..81bfd6ea2 + return ret; +} + -+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) ++static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) +{ -+ struct bkey_s_c k; + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + @@ -53278,20 +53736,11 @@ index 000000000..81bfd6ea2 + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ 
BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (k.k->type == KEY_TYPE_reflink_p) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ fix_reflink_p_key(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ fix_reflink_p_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); + return ret; @@ -53303,9 +53752,12 @@ index 000000000..81bfd6ea2 + */ +int bch2_fsck_full(struct bch_fs *c) +{ -+ return bch2_fs_snapshots_check(c) ?: ++ int ret; ++again: ++ ret = bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: + check_inodes(c, true) ?: -+ check_subvols(c) ?: + check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: @@ -53313,15 +53765,25 @@ index 000000000..81bfd6ea2 + check_directory_structure(c) ?: + check_nlinks(c) ?: + fix_reflink_p(c); ++ ++ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ goto again; ++ } ++ ++ return ret; +} + +int bch2_fsck_walk_inodes_only(struct bch_fs *c) +{ -+ return check_inodes(c, false); ++ return bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: ++ check_inodes(c, false); +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 -index 000000000..264f2706b +index 000000000000..264f2706b12d --- /dev/null +++ b/fs/bcachefs/fsck.h @@ -0,0 +1,8 @@ @@ -53335,7 +53797,7 @@ index 000000000..264f2706b +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000..6a2b94908 +index 000000000000..083106006747 --- /dev/null +++ b/fs/bcachefs/inode.c @@ -0,0 +1,771 @@ @@ -53980,7 +54442,7 @@ index 000000000..6a2b94908 + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: -+ if (ret && ret != -EINTR) ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + @@ -54051,7 +54513,7 @@ index 000000000..6a2b94908 + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(&trans, &iter); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -54112,7 +54574,7 @@ index 000000000..6a2b94908 +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000..2ac2fc105 +index 000000000000..2ac2fc10513b --- /dev/null +++ b/fs/bcachefs/inode.h @@ -0,0 +1,189 @@ @@ -54307,10 +54769,10 @@ index 000000000..2ac2fc105 +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000..50fa57234 +index 000000000000..971f8ba00dbd --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,2417 @@ +@@ -0,0 +1,2422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -54625,7 +55087,7 @@ index 000000000..50fa57234 +} + +/* -+ * Returns -EINTR if we had to drop locks: ++ * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, @@ -54638,7 +55100,8 @@ index 000000000..50fa57234 + int ret = 0, ret2 = 0; + u32 snapshot; + -+ while (!ret || ret == -EINTR) { ++ while (!ret || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + 
struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; @@ -54697,7 +55160,10 @@ index 000000000..50fa57234 + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + -+ return ret == -EINTR ? 0 : ret; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ ++ return ret; +} + +int bch2_write_index_default(struct bch_write_op *op) @@ -54728,7 +55194,7 @@ index 000000000..50fa57234 + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; @@ -54743,7 +55209,7 @@ index 000000000..50fa57234 + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; @@ -54893,7 +55359,7 @@ index 000000000..50fa57234 + u64 sectors_start = keylist_sectors(keys); + int ret = op->index_update_fn(op); + -+ BUG_ON(ret == -EINTR); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); @@ -56629,10 +57095,9 @@ index 000000000..50fa57234 + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ -+ if (!bch2_trans_relock(&trans)) { -+ ret = -EINTR; ++ ret = bch2_trans_relock(&trans); ++ if (ret) + break; -+ } + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); @@ -56686,7 +57151,9 @@ index 000000000..50fa57234 +err: + bch2_trans_iter_exit(&trans, &iter); + -+ if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == READ_RETRY || ++ ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_exit(&trans); @@ -56730,7 +57197,7 @@ index 000000000..50fa57234 +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000..fb5114518 +index 000000000000..fb5114518666 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,189 @@ @@ -56925,7 +57392,7 @@ index 000000000..fb5114518 +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 -index 000000000..78bff13d3 +index 000000000000..78bff13d36f2 --- /dev/null +++ b/fs/bcachefs/io_types.h @@ -0,0 +1,161 @@ @@ -57092,7 +57559,7 @@ index 000000000..78bff13d3 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000..b561ed787 +index 000000000000..937ed1395e46 --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1429 @@ @@ -57981,7 +58448,7 @@ index 000000000..b561ed787 + + if (!new_fs) { + for (i = 0; i < nr_got; i++) { -+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_journal, + ca->mi.bucket_size)); @@ -58527,7 +58994,7 @@ index 000000000..b561ed787 +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000..d3caa7ea7 +index 000000000000..d3caa7ea7ce9 --- /dev/null +++ b/fs/bcachefs/journal.h @@ -0,0 +1,521 @@ @@ -59054,7 +59521,7 @@ index 000000000..d3caa7ea7 +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000..0ff78a274 +index 000000000000..6fa2c54c1af4 --- /dev/null +++ b/fs/bcachefs/journal_io.c @@ 
-0,0 +1,1735 @@ @@ -59257,7 +59724,7 @@ index 000000000..0ff78a274 + bch_err(c, "corrupt metadata before write:\n" \ + msg, ##__VA_ARGS__); \ + if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + } \ + break; \ @@ -59918,7 +60385,7 @@ index 000000000..0ff78a274 + end - offset, sectors_read, + READ); + switch (ret) { -+ case BCH_FSCK_OK: ++ case 0: + sectors = vstruct_sectors(j, c->block_bits); + break; + case JOURNAL_ENTRY_REREAD: @@ -60795,7 +61262,7 @@ index 000000000..0ff78a274 +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000..30e995c81 +index 000000000000..30e995c81fc4 --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,59 @@ @@ -60860,14 +61327,15 @@ index 000000000..30e995c81 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000..fdc94e831 +index 000000000000..6f0ab411c98e --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,849 @@ +@@ -0,0 +1,852 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" ++#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" @@ -61606,15 +62074,17 @@ index 000000000..fdc94e831 +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; ++ int ret; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); -+ if (IS_ERR(p)) { -+ bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); -+ return PTR_ERR(p); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); ++ return ret; + } + + get_task_struct(p); @@ -61715,7 +62185,7 @@ index 000000000..fdc94e831 +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h new file mode 100644 -index 000000000..0fd1af120 +index 000000000000..0fd1af120db5 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.h @@ -0,0 +1,86 @@ @@ -61807,7 +62277,7 @@ index 000000000..0fd1af120 +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 -index 000000000..001cecec1 +index 000000000000..001cecec1291 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,220 @@ @@ -62033,7 +62503,7 @@ index 000000000..001cecec1 +} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h new file mode 100644 -index 000000000..a39192e9f +index 000000000000..a39192e9f6f4 --- /dev/null +++ b/fs/bcachefs/journal_sb.h @@ -0,0 +1,24 @@ @@ -62063,7 +62533,7 @@ index 000000000..a39192e9f +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 -index 000000000..d9b4042a2 +index 000000000000..5c555b3703c0 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -0,0 +1,322 @@ @@ -62341,7 +62811,7 @@ index 000000000..d9b4042a2 + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); @@ -62391,7 +62861,7 @@ index 000000000..d9b4042a2 +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h new file mode 100644 -index 000000000..afb886ec8 +index 
000000000000..afb886ec8e25 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -0,0 +1,22 @@ @@ -62419,7 +62889,7 @@ index 000000000..afb886ec8 +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 -index 000000000..a6cdb885a +index 000000000000..a6cdb885ad41 --- /dev/null +++ b/fs/bcachefs/journal_types.h @@ -0,0 +1,340 @@ @@ -62765,7 +63235,7 @@ index 000000000..a6cdb885a +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 -index 000000000..cda77835b +index 000000000000..cda77835b9ea --- /dev/null +++ b/fs/bcachefs/keylist.c @@ -0,0 +1,67 @@ @@ -62838,7 +63308,7 @@ index 000000000..cda77835b +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 -index 000000000..195799bb2 +index 000000000000..195799bb20bc --- /dev/null +++ b/fs/bcachefs/keylist.h @@ -0,0 +1,76 @@ @@ -62920,7 +63390,7 @@ index 000000000..195799bb2 +#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h new file mode 100644 -index 000000000..4b3ff7d8a +index 000000000000..4b3ff7d8a875 --- /dev/null +++ b/fs/bcachefs/keylist_types.h @@ -0,0 +1,16 @@ @@ -62942,10 +63412,10 @@ index 000000000..4b3ff7d8a +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 -index 000000000..5a09b5500 +index 000000000000..53e607d72274 --- /dev/null +++ b/fs/bcachefs/lru.c -@@ -0,0 +1,219 @@ +@@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -63078,25 +63548,18 @@ index 000000000..5a09b5500 +} + +static int bch2_check_lru_key(struct btree_trans *trans, -+ struct btree_iter *lru_iter) ++ struct btree_iter *lru_iter, ++ struct bkey_s_c lru_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; -+ struct bkey_s_c lru_k, k; ++ struct bkey_s_c k; + struct bch_alloc_v4 a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct bpos alloc_pos; + int ret; + -+ lru_k = bch2_btree_iter_peek(lru_iter); -+ if (!lru_k.k) -+ return 0; -+ -+ ret = bkey_err(lru_k); -+ if (ret) -+ return ret; -+ + alloc_pos = POS(lru_k.k->p.inode, + le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); + @@ -63150,16 +63613,10 @@ index 000000000..5a09b5500 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_check_lru_key(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_lru_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); + return ret; @@ -63167,7 +63624,7 @@ index 000000000..5a09b5500 +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 -index 000000000..3decb7b1d +index 000000000000..3decb7b1dde2 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,19 @@ @@ -63192,10 +63649,10 @@ index 000000000..3decb7b1d +#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 -index 000000000..5345697f2 +index 000000000000..8b258d966d04 --- /dev/null +++ b/fs/bcachefs/migrate.c -@@ -0,0 +1,193 @@ +@@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for moving data off a 
device. @@ -63206,6 +63663,7 @@ index 000000000..5345697f2 +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" ++#include "errcode.h" +#include "extents.h" +#include "io.h" +#include "journal.h" @@ -63233,83 +63691,74 @@ index 000000000..5345697f2 + return 0; +} + -+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, -+ enum btree_id btree_id) ++static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ unsigned dev_idx, ++ int flags) +{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf sk; -+ int ret = 0; ++ struct bch_fs *c = trans->c; ++ struct bkey_i *n; ++ int ret; + -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ if (!bch2_bkey_has_device(k, dev_idx)) ++ return 0; + -+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; + -+ while ((bch2_trans_begin(&trans), -+ (k = bch2_btree_iter_peek(&iter)).k) && -+ !(ret = bkey_err(k))) { -+ if (!bch2_bkey_has_device(k, dev_idx)) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } ++ bkey_reassemble(n, k); + -+ bch2_bkey_buf_reassemble(&sk, c, k); ++ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); ++ if (ret) ++ return ret; + -+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), -+ dev_idx, flags, false); -+ if (ret) -+ break; ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(n)); + -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&n->k)) ++ n->k.size = 0; + -+ /* -+ * Since we're not inserting through an extent iterator -+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), -+ * we aren't using the extent overwrite path to delete, we're -+ * just using the normal key deletion path: -+ */ -+ if (bkey_deleted(&sk.k->k)) -+ sk.k->k.size = 0; -+ -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, sk.k, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * don't want to leave ret == -EINTR, since if we raced and -+ * something else overwrote the key we could spuriously return -+ * -EINTR below: -+ */ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; ++ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ -+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: -+ __bch2_dev_usrdata_drop(c, dev_idx, flags, 
BTREE_ID_reflink); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id id; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!btree_type_has_ptrs(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; +} + +static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) @@ -63352,19 +63801,20 @@ index 000000000..5345697f2 + } + + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); -+ if (ret == -EINTR) { ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + + if (ret) { -+ bch_err(c, "Error updating btree node key: %i", ret); ++ bch_err(c, "Error updating btree node key: %s", ++ bch2_err_str(ret)); + break; + } +next: + bch2_btree_iter_next_node(&iter); + } -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); @@ -63379,7 +63829,7 @@ index 000000000..5345697f2 + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); + -+ BUG_ON(ret == -EINTR); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + return ret; +} @@ -63391,7 +63841,7 @@ index 000000000..5345697f2 +} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h new file mode 100644 -index 000000000..027efaa0d +index 000000000000..027efaa0d575 --- /dev/null +++ b/fs/bcachefs/migrate.h @@ -0,0 +1,7 @@ @@ -63404,10 +63854,10 @@ index 000000000..027efaa0d +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000..9748b8653 +index 000000000000..2fc247451390 --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,951 @@ +@@ -0,0 +1,952 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -63419,6 +63869,7 @@ index 000000000..9748b8653 +#include "btree_update_interior.h" +#include "disk_groups.h" +#include "ec.h" ++#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "io.h" @@ -63780,7 +64231,7 @@ index 000000000..9748b8653 + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret) @@ -63828,7 +64279,7 @@ index 000000000..9748b8653 + break; + + ret = bkey_err(k); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; @@ -63859,7 +64310,7 @@ index 000000000..9748b8653 + ret2 = bch2_move_extent(&trans, ctxt, io_opts, + btree_id, k, data_opts); + if (ret2) { -+ if (ret2 == -EINTR) ++ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + + if (ret2 == -ENOMEM) { @@ -63984,7 +64435,7 @@ index 000000000..9748b8653 + + ret = bch2_get_next_backpointer(&trans, bucket, gen, + &bp_offset, &bp); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -63999,7 +64450,7 @@ index 000000000..9748b8653 + k = bch2_backpointer_get_key(&trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -64026,7 +64477,7 @@ index 000000000..9748b8653 + + ret = 
bch2_move_extent(&trans, ctxt, io_opts, + bp.btree_id, k, data_opts); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ @@ -64045,7 +64496,7 @@ index 000000000..9748b8653 + b = bch2_backpointer_get_node(&trans, &iter, + bucket, bp_offset, bp); + ret = PTR_ERR_OR_ZERO(b); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -64055,7 +64506,7 @@ index 000000000..9748b8653 + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); + bch2_trans_iter_exit(&trans, &iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -64150,14 +64601,14 @@ index 000000000..9748b8653 + goto next; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; +next: + bch2_btree_iter_next_node(&iter); + } -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); @@ -64169,7 +64620,7 @@ index 000000000..9748b8653 + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error %i in bch2_move_btree", ret); ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_btree_interior_updates_flush(c); + @@ -64361,7 +64812,7 @@ index 000000000..9748b8653 +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 -index 000000000..c0fec69bb +index 000000000000..c0fec69bbb6a --- /dev/null +++ b/fs/bcachefs/move.h @@ -0,0 +1,67 @@ @@ -64434,7 +64885,7 @@ index 000000000..c0fec69bb +#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h new file mode 100644 -index 000000000..9df6d1813 +index 000000000000..9df6d18137a5 --- /dev/null +++ b/fs/bcachefs/move_types.h @@ -0,0 +1,19 @@ @@ -64459,10 +64910,10 @@ index 000000000..9df6d1813 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000..f9ad4cb26 +index 000000000000..f913864eaa4f --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,282 @@ +@@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -64478,6 +64929,7 @@ index 000000000..f9ad4cb26 +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" ++#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "eytzinger.h" @@ -64627,7 +65079,7 @@ index 000000000..f9ad4cb26 + bch2_moving_ctxt_exit(&ctxt); + + if (ret < 0) -+ bch_err(c, "error %i from bch2_move_data() in copygc", ret); ++ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + return ret; @@ -64716,6 +65168,7 @@ index 000000000..f9ad4cb26 +int bch2_copygc_start(struct bch_fs *c) +{ + struct task_struct *t; ++ int ret; + + if (c->copygc_thread) + return 0; @@ -64727,9 +65180,10 @@ index 000000000..f9ad4cb26 + return -ENOMEM; + + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); -+ if (IS_ERR(t)) { -+ bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); -+ return PTR_ERR(t); ++ ret = PTR_ERR_OR_ZERO(t); ++ if (ret) { ++ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); ++ return ret; + } + + get_task_struct(t); @@ -64747,14 +65201,15 @@ index 000000000..f9ad4cb26 +} 
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h new file mode 100644 -index 000000000..922738247 +index 000000000000..e85c8136a46e --- /dev/null +++ b/fs/bcachefs/movinggc.h -@@ -0,0 +1,9 @@ +@@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVINGGC_H +#define _BCACHEFS_MOVINGGC_H + ++unsigned long bch2_copygc_wait_amount(struct bch_fs *); +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void bch2_fs_copygc_init(struct bch_fs *); @@ -64762,7 +65217,7 @@ index 000000000..922738247 +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000..407b221e8 +index 000000000000..407b221e8f6c --- /dev/null +++ b/fs/bcachefs/opts.c @@ -0,0 +1,578 @@ @@ -65346,10 +65801,10 @@ index 000000000..407b221e8 +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 -index 000000000..2f5f49cb7 +index 000000000000..5b8586ecb374 --- /dev/null +++ b/fs/bcachefs/opts.h -@@ -0,0 +1,504 @@ +@@ -0,0 +1,509 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H @@ -65693,6 +66148,11 @@ index 000000000..2f5f49cb7 + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ ++ x(direct_io, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Use O_DIRECT (userspace only)") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ @@ -65856,13 +66316,14 @@ index 000000000..2f5f49cb7 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000..d764dc7ab +index 000000000000..454c76e03be9 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,859 @@ +@@ -0,0 +1,823 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" ++#include "errcode.h" +#include "inode.h" +#include "quota.h" +#include "subvolume.h" @@ -66232,6 +66693,9 @@ index 000000000..d764dc7ab + + BUG_ON(k.k->p.inode >= QTYP_NR); + ++ if (!((1U << k.k->p.inode) & enabled_qtypes(c))) ++ return 0; ++ + switch (k.k->type) { + case KEY_TYPE_quota: + dq = bkey_s_c_to_quota(k); @@ -66255,30 +66719,6 @@ index 000000000..d764dc7ab + return 0; +} + -+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->p.inode != type) -+ break; -+ -+ ret = __bch2_quota_set(c, k); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; @@ -66317,22 +66757,14 @@ index 000000000..d764dc7ab +} + +static int bch2_fs_quota_read_inode(struct btree_trans *trans, -+ struct btree_iter *iter) ++ struct btree_iter *iter, ++ struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_subvolume subvolume; -+ struct bkey_s_c k; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (!k.k) -+ return 1; -+ + ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + if (ret) + return ret; @@ -66361,36 +66793,28 @@ index 000000000..d764dc7ab + +int bch2_fs_quota_read(struct bch_fs *c) +{ -+ unsigned i, qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; + 
struct btree_trans trans; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_init_type(c, i); -+ if (ret) -+ return ret; -+ } -+ + bch2_trans_init(&trans, c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ do { -+ ret = lockrestart_do(&trans, -+ bch2_fs_quota_read_inode(&trans, &iter)); -+ } while (!ret); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ __bch2_quota_set(c, k)) ?: ++ for_each_btree_key2(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ bch2_fs_quota_read_inode(&trans, &iter, k)); ++ if (ret) ++ bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); + + bch2_trans_exit(&trans); -+ return ret < 0 ? ret : 0; ++ return ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ @@ -66721,7 +67145,7 @@ index 000000000..d764dc7ab +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 -index 000000000..8c67ae1da +index 000000000000..8c67ae1da7c7 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,71 @@ @@ -66798,7 +67222,7 @@ index 000000000..8c67ae1da +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h new file mode 100644 -index 000000000..6a136083d +index 000000000000..6a136083d389 --- /dev/null +++ b/fs/bcachefs/quota_types.h @@ -0,0 +1,43 @@ @@ -66847,10 +67271,10 @@ index 000000000..6a136083d +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000..31da40933 +index 000000000000..ecc64dd92b05 --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,358 @@ +@@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -66859,6 +67283,7 @@ index 000000000..31da40933 +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" ++#include "errcode.h" +#include "extents.h" +#include "io.h" +#include "move.h" @@ -67184,6 +67609,7 @@ index 000000000..31da40933 +int bch2_rebalance_start(struct bch_fs *c) +{ + struct task_struct *p; ++ int ret; + + if (c->rebalance.thread) + return 0; @@ -67192,9 +67618,10 @@ index 000000000..31da40933 + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); -+ if (IS_ERR(p)) { -+ bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); -+ return PTR_ERR(p); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); ++ return ret; + } + + get_task_struct(p); @@ -67211,7 +67638,7 @@ index 000000000..31da40933 +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 -index 000000000..7ade0bb81 +index 000000000000..7ade0bb81cce --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,28 @@ @@ -67245,7 +67672,7 @@ index 000000000..7ade0bb81 +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h new file mode 100644 -index 000000000..7462a92e9 +index 000000000000..7462a92e9598 --- /dev/null +++ b/fs/bcachefs/rebalance_types.h @@ -0,0 +1,26 @@ @@ -67277,10 +67704,10 @@ index 000000000..7462a92e9 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000..eea025a83 +index 000000000000..b070bdf01500 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1584 @@ +@@ -0,0 +1,1597 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -67294,6 +67721,7 @@ index 000000000..eea025a83 +#include "buckets.h" +#include "dirent.h" +#include "ec.h" ++#include "errcode.h" +#include "error.h" +#include "fs-common.h" +#include "fsck.h" @@ -67370,9 +67798,9 @@ index 000000000..eea025a83 + return keys->d + idx_to_pos(keys, idx); +} + -+static size_t bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) ++static size_t __bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + @@ -67390,7 +67818,14 @@ index 000000000..eea025a83 + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + -+ return idx_to_pos(keys, l); ++ return l; ++} ++ ++static size_t bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, @@ -67399,22 +67834,21 @@ index 000000000..eea025a83 +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; ++ struct journal_key *k; +search: + if (!*idx) -+ *idx = bch2_journal_key_search(keys, btree_id, level, pos); ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + -+ while (*idx < keys->size && -+ keys->d[*idx].btree_id == btree_id && -+ keys->d[*idx].level == level && -+ bpos_cmp(keys->d[*idx].k->k.p, end_pos) <= 0) { -+ if (bpos_cmp(keys->d[*idx].k->k.p, pos) >= 0 && -+ !keys->d[*idx].overwritten) -+ return keys->d[*idx].k; ++ while (*idx < keys->nr && ++ (k = idx_to_key(keys, *idx), ++ k->btree_id == btree_id && ++ k->level == level && ++ bpos_cmp(k->k->k.p, end_pos) <= 0)) { ++ if (bpos_cmp(k->k->k.p, pos) >= 0 && ++ !k->overwritten) ++ return k->k; + + (*idx)++; -+ if (*idx == keys->gap) -+ *idx += keys->size - keys->nr; -+ + iters++; + if (iters == 10) { + *idx = 0; @@ -68436,7 +68870,7 @@ index 000000000..eea025a83 +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); -+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; ++ ret = -BCH_ERR_fsck_repair_impossible; + goto err; + + } @@ -68711,10 +69145,16 @@ index 000000000..eea025a83 + bch2_journal_entries_free(c); + } + kfree(clean); ++ ++ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { ++ bch2_fs_read_write_early(c); ++ bch2_delete_dead_snapshots_async(c); ++ } ++ + if (ret) -+ bch_err(c, "Error in recovery: %s (%i)", err, ret); ++ bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); + else -+ bch_verbose(c, "ret %i", ret); ++ bch_verbose(c, "ret %s", bch2_err_str(ret)); + return ret; +err: +fsck_err: @@ -68867,7 +69307,7 @@ index 000000000..eea025a83 +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000..8c0348e8b +index 000000000000..8c0348e8b84c --- /dev/null +++ b/fs/bcachefs/recovery.h @@ -0,0 +1,58 @@ @@ -68931,10 +69371,10 @@ index 000000000..8c0348e8b +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 -index 000000000..2038e3502 +index 000000000000..d5c14bb2992d --- /dev/null +++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,421 
@@ +@@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -69236,7 +69676,8 @@ index 000000000..2038e3502 + bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + -+ while ((ret == 0 || ret == -EINTR) && ++ while ((ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) && + bkey_cmp(dst_iter.pos, dst_end) < 0) { + struct disk_reservation disk_res = { 0 }; + @@ -69346,7 +69787,7 @@ index 000000000..2038e3502 + } + + bch2_trans_iter_exit(&trans, &inode_iter); -+ } while (ret2 == -EINTR); ++ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&new_src, c); @@ -69358,7 +69799,7 @@ index 000000000..2038e3502 +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000..f9848dc3e +index 000000000000..f9848dc3eebb --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,76 @@ @@ -69440,7 +69881,7 @@ index 000000000..f9848dc3e +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 -index 000000000..9cb47ba62 +index 000000000000..9cb47ba62bc3 --- /dev/null +++ b/fs/bcachefs/replicas.c @@ -0,0 +1,1073 @@ @@ -70519,7 +70960,7 @@ index 000000000..9cb47ba62 +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 -index 000000000..87820b2e1 +index 000000000000..87820b2e1ad3 --- /dev/null +++ b/fs/bcachefs/replicas.h @@ -0,0 +1,106 @@ @@ -70631,7 +71072,7 @@ index 000000000..87820b2e1 +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 -index 000000000..0535b1d37 +index 000000000000..0535b1d3760e --- /dev/null +++ b/fs/bcachefs/replicas_types.h @@ -0,0 +1,10 @@ @@ -70647,7 +71088,7 @@ index 000000000..0535b1d37 +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 -index 000000000..c062edb3f +index 000000000000..c062edb3fbc2 --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ @@ -70826,7 +71267,7 @@ index 000000000..c062edb3f +} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h new file mode 100644 -index 000000000..3dfaf34a4 +index 000000000000..3dfaf34a43b2 --- /dev/null +++ b/fs/bcachefs/siphash.h @@ -0,0 +1,87 @@ @@ -70919,7 +71360,7 @@ index 000000000..3dfaf34a4 +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 -index 000000000..591bbb9f8 +index 000000000000..591bbb9f8beb --- /dev/null +++ b/fs/bcachefs/str_hash.h @@ -0,0 +1,351 @@ @@ -71276,30 +71717,28 @@ index 000000000..591bbb9f8 +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000..60b60de83 +index 000000000000..b5b0f5e39f97 --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1095 @@ +@@ -0,0 +1,1108 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" ++#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "subvolume.h" + +/* Snapshot tree: */ + -+static void bch2_delete_dead_snapshots_work(struct work_struct *); -+static void bch2_delete_dead_snapshots(struct bch_fs *); -+ +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + -+ prt_printf(out, "is_subvol %llu deleted %llu parent %u 
children %u %u subvol %u", ++ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), @@ -71416,7 +71855,7 @@ index 000000000..60b60de83 + if (!id) + return 0; + -+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ ret = snapshot_lookup(trans, id, &v); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) @@ -71425,157 +71864,206 @@ index 000000000..60b60de83 + return !BCH_SNAPSHOT_DELETED(&v); +} + -+static int bch2_snapshots_set_equiv(struct btree_trans *trans) ++static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; ++ unsigned i, nr_live = 0, live_idx = 0; + struct bkey_s_c_snapshot snap; -+ unsigned i; -+ int ret; ++ u32 id = k.k->p.offset, child[2]; + -+ for_each_btree_key(trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ u32 id = k.k->p.offset, child[2]; -+ unsigned nr_live = 0, live_idx = 0; ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; + -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; ++ snap = bkey_s_c_to_snapshot(k); + -+ snap = bkey_s_c_to_snapshot(k); -+ child[0] = le32_to_cpu(snap.v->children[0]); -+ child[1] = le32_to_cpu(snap.v->children[1]); ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); + -+ for (i = 0; i < 2; i++) { -+ ret = snapshot_live(trans, child[i]); -+ if (ret < 0) -+ goto err; ++ for (i = 0; i < 2; i++) { ++ int ret = snapshot_live(trans, child[i]); ++ if (ret < 0) ++ return ret; + -+ if (ret) -+ live_idx = i; -+ nr_live += ret; -+ } -+ -+ snapshot_t(c, id)->equiv = nr_live == 1 -+ ? snapshot_t(c, child[live_idx])->equiv -+ : id; ++ if (ret) ++ live_idx = i; ++ nr_live += ret; + } -+err: -+ bch2_trans_iter_exit(trans, &iter); + -+ if (ret) -+ bch_err(c, "error walking snapshots: %i", ret); -+ -+ return ret; ++ snapshot_t(c, id)->equiv = nr_live == 1 ++ ? 
snapshot_t(c, child[live_idx])->equiv ++ : id; ++ return 0; +} + +/* fsck: */ -+static int bch2_snapshot_check(struct btree_trans *trans, -+ struct bkey_s_c_snapshot s) ++static int check_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) +{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; ++ struct printbuf buf = PRINTBUF; ++ bool should_have_subvol; + u32 i, id; -+ int ret; ++ int ret = 0; + -+ id = le32_to_cpu(s.v->subvol); -+ ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); -+ if (ret == -ENOENT) -+ bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", -+ s.k->p.offset, id); -+ if (ret) -+ return ret; -+ -+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { -+ bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", -+ s.k->p.offset); -+ return -EINVAL; -+ } ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; + ++ s = bkey_s_c_to_snapshot(k); + id = le32_to_cpu(s.v->parent); + if (id) { -+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ ret = snapshot_lookup(trans, id, &v); + if (ret == -ENOENT) -+ bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", -+ s.k->p.offset, id); ++ bch_err(c, "snapshot with nonexistent parent:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + if (ret) -+ return ret; ++ goto err; + + if (le32_to_cpu(v.children[0]) != s.k->p.offset && + le32_to_cpu(v.children[1]) != s.k->p.offset) { -+ bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", ++ bch_err(c, "snapshot parent %u missing pointer to child %llu", + id, s.k->p.offset); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + } + + for (i = 0; i < 2 && s.v->children[i]; i++) { + id = le32_to_cpu(s.v->children[i]); + -+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ ret = snapshot_lookup(trans, id, &v); + if (ret == -ENOENT) -+ bch_err(trans->c, "snapshot node %llu has nonexistent child %u", ++ bch_err(c, "snapshot node %llu has nonexistent child %u", + s.k->p.offset, id); + if (ret) -+ return ret; ++ goto err; + + if (le32_to_cpu(v.parent) != s.k->p.offset) { -+ bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), s.k->p.offset); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + } + ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && ++ !BCH_SNAPSHOT_DELETED(s.v); ++ ++ if (should_have_subvol) { ++ id = le32_to_cpu(s.v->subvol); ++ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { ++ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } else { ++ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ++ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&u->k_i, s.s_c); ++ u->v.subvol = 0; ++ ret = bch2_trans_update(trans, iter, &u->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) 
++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot(&trans, &iter, k)); ++ ++ if (ret) ++ bch_err(c, "error %i checking snapshots", ret); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int check_subvol(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume subvol; ++ struct bch_snapshot snapshot; ++ unsigned snapid; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_subvolume) ++ return 0; ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ret = snapshot_lookup(trans, snapid, &snapshot); ++ ++ if (ret == -ENOENT) ++ bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", ++ k.k->p.offset, snapid); ++ if (ret) ++ return ret; ++ ++ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ ret = bch2_subvolume_delete(trans, iter->pos.offset); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error deleting subvolume %llu: %s", ++ iter->pos.offset, bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ } ++ + return 0; +} + -+int bch2_fs_snapshots_check(struct bch_fs *c) ++int bch2_fs_check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct bch_snapshot s; -+ unsigned id; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_subvol(&trans, &iter, k)); + -+ ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err(c, "error %i checking snapshots", ret); -+ goto err; -+ } -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_subvolume) -+ continue; -+again_2: -+ id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); -+ ret = snapshot_lookup(&trans, id, &s); -+ -+ if (ret == -EINTR) { -+ k = bch2_btree_iter_peek(&iter); -+ goto again_2; -+ } else if (ret == -ENOENT) -+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u", -+ k.k->p.offset, id); -+ else if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+err: + bch2_trans_exit(&trans); ++ + return ret; +} + @@ -71589,49 +72077,19 @@ index 000000000..60b60de83 + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ bool have_deleted = false; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) -+ break; ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: ++ bch2_snapshot_set_equiv(&trans, k)); + -+ if (k.k->type != KEY_TYPE_snapshot) { -+ bch_err(c, "found wrong key type %u in snapshot node table", -+ k.k->type); 
-+ continue; -+ } -+ -+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) -+ have_deleted = true; -+ -+ ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) -+ goto err; -+ -+ ret = bch2_snapshots_set_equiv(&trans); -+ if (ret) -+ goto err; -+err: + bch2_trans_exit(&trans); + -+ if (!ret && have_deleted) { -+ bch_info(c, "restarting deletion of dead snapshots"); -+ if (c->opts.fsck) { -+ bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); -+ } else { -+ bch2_delete_dead_snapshots(c); -+ } -+ } -+ ++ if (ret) ++ bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); + return ret; +} + @@ -71668,8 +72126,10 @@ index 000000000..60b60de83 + goto err; + + bkey_reassemble(&s->k_i, k); -+ + SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); ++ s->v.subvol = 0; ++ + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; @@ -71833,6 +72293,7 @@ index 000000000..60b60de83 + + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); ++ n->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) @@ -71843,126 +72304,100 @@ index 000000000..60b60de83 + return ret; +} + -+static int snapshot_id_add(snapshot_id_list *s, u32 id) -+{ -+ BUG_ON(snapshot_list_has_id(s, id)); -+ -+ return darray_push(s, id); -+} -+ -+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, -+ snapshot_id_list *deleted, -+ enum btree_id btree_id) ++static int snapshot_delete_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *deleted, ++ snapshot_id_list *equiv_seen, ++ struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ snapshot_id_list equiv_seen = { 0 }; -+ struct bpos last_pos = POS_MIN; -+ int ret = 0; ++ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + -+ /* -+ * XXX: We should also delete whiteouts that no longer overwrite -+ * anything -+ */ ++ if (bkey_cmp(k.k->p, *last_pos)) ++ equiv_seen->nr = 0; ++ *last_pos = k.k->p; + -+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ while ((bch2_trans_begin(trans), -+ (k = bch2_btree_iter_peek(&iter)).k) && -+ !(ret = bkey_err(k))) { -+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; -+ -+ if (bkey_cmp(k.k->p, last_pos)) -+ equiv_seen.nr = 0; -+ last_pos = k.k->p; -+ -+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || -+ snapshot_list_has_id(&equiv_seen, equiv)) { -+ if (btree_id == BTREE_ID_inodes && -+ bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) -+ continue; -+ -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_delete_at(trans, &iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); -+ if (ret) -+ break; -+ } else { -+ ret = snapshot_id_add(&equiv_seen, equiv); -+ if (ret) -+ break; -+ } -+ -+ bch2_btree_iter_advance(&iter); ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(equiv_seen, equiv)) { ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } else { ++ return snapshot_list_add(c, equiv_seen, equiv); + } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ darray_exit(&equiv_seen); -+ -+ return ret; +} + -+static void 
bch2_delete_dead_snapshots_work(struct work_struct *work) ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot snap; ++ u32 children[2]; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ return 0; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = snapshot_live(trans, children[0]) ?: ++ snapshot_live(trans, children[1]); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret) ++ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return 0; ++} ++ ++int bch2_delete_dead_snapshots(struct bch_fs *c) +{ -+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; -+ u32 i, id, children[2]; ++ u32 i, id; + int ret = 0; + ++ if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) ++ return 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ ret = bch2_fs_read_write_early(c); ++ if (ret) { ++ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ } ++ + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v) || -+ BCH_SNAPSHOT_SUBVOL(snap.v)) -+ continue; -+ -+ children[0] = le32_to_cpu(snap.v->children[0]); -+ children[1] = le32_to_cpu(snap.v->children[1]); -+ -+ ret = snapshot_live(&trans, children[0]) ?: -+ snapshot_live(&trans, children[1]); -+ if (ret < 0) -+ break; -+ if (ret) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); -+ if (ret) { -+ bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ NULL, NULL, 0, ++ bch2_delete_redundant_snapshot(&trans, &iter, k)); + if (ret) { -+ bch_err(c, "error walking snapshots: %i", ret); ++ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); + goto err; + } + -+ ret = bch2_snapshots_set_equiv(&trans); -+ if (ret) ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_snapshot_set_equiv(&trans, k)); ++ if (ret) { ++ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); + goto err; ++ } + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { @@ -71971,7 +72406,7 @@ index 000000000..60b60de83 + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { -+ ret = snapshot_id_add(&deleted, k.k->p.offset); ++ ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } @@ -71979,39 +72414,59 @@ index 000000000..60b60de83 + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { -+ bch_err(c, "error walking snapshots: %i", ret); ++ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { ++ struct bpos last_pos = POS_MIN; ++ 
snapshot_id_list equiv_seen = { 0 }; ++ + if (!btree_type_has_snapshots(id)) + continue; + -+ ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); ++ ret = for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); ++ ++ darray_exit(&equiv_seen); ++ + if (ret) { -+ bch_err(c, "error deleting snapshot keys: %i", ret); ++ bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.data[i])); + if (ret) { -+ bch_err(c, "error deleting snapshot %u: %i", -+ deleted.data[i], ret); ++ bch_err(c, "error deleting snapshot %u: %s", ++ deleted.data[i], bch2_err_str(ret)); + goto err; + } + } ++ ++ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: + darray_exit(&deleted); + bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ ++ bch2_delete_dead_snapshots(c); + percpu_ref_put(&c->writes); +} + -+static void bch2_delete_dead_snapshots(struct bch_fs *c) ++void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ -+ if (unlikely(!percpu_ref_tryget_live(&c->writes))) ++ if (!percpu_ref_tryget_live(&c->writes)) + return; + + if (!queue_work(system_long_wq, &c->snapshot_delete_work)) @@ -72021,7 +72476,14 @@ index 000000000..60b60de83 +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ -+ bch2_delete_dead_snapshots(trans->c); ++ struct bch_fs *c = trans->c; ++ ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ ++ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) ++ return 0; ++ ++ bch2_delete_dead_snapshots_async(c); + return 0; +} + @@ -72112,7 +72574,6 @@ index 000000000..60b60de83 + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; -+ struct bkey_i *delete; + u32 snapid; + int ret = 0; + @@ -72134,14 +72595,7 @@ index 000000000..60b60de83 + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ ret = PTR_ERR_OR_ZERO(delete); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete->k); -+ delete->k.p = iter.pos; -+ ret = bch2_trans_update(trans, &iter, delete, 0); ++ ret = bch2_btree_delete_at(trans, &iter, 0); + if (ret) + goto err; + @@ -72182,7 +72636,7 @@ index 000000000..60b60de83 + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { -+ bch_err(c, "error %i deleting subvolume %u", ret, *id); ++ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); + break; + } + } @@ -72207,7 +72661,7 @@ index 000000000..60b60de83 + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) -+ ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); ++ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) @@ -72377,10 +72831,10 @@ index 000000000..60b60de83 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000..b1739d29c +index 000000000000..02a636644988 --- /dev/null +++ 
b/fs/bcachefs/subvolume.h -@@ -0,0 +1,126 @@ +@@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H @@ -72410,6 +72864,16 @@ index 000000000..b1739d29c + return snapshot_t(c, id)->parent; +} + ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->equiv; ++} ++ ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++{ ++ return id == snapshot_t(c, id)->equiv; ++} ++ +static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s = snapshot_t(c, id); @@ -72441,31 +72905,6 @@ index 000000000..b1739d29c + return id == ancestor; +} + -+struct snapshots_seen { -+ struct bpos pos; -+ DARRAY(u32) ids; -+}; -+ -+static inline void snapshots_seen_exit(struct snapshots_seen *s) -+{ -+ kfree(s->ids.data); -+ s->ids.data = NULL; -+} -+ -+static inline void snapshots_seen_init(struct snapshots_seen *s) -+{ -+ memset(s, 0, sizeof(*s)); -+} -+ -+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) -+{ -+ int ret = darray_push(&s->ids, id); -+ if (ret) -+ bch_err(c, "error reallocating snapshots_seen table (size %zu)", -+ s->ids.size); -+ return ret; -+} -+ +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ + u32 *i; @@ -72476,7 +72915,30 @@ index 000000000..b1739d29c + return false; +} + -+int bch2_fs_snapshots_check(struct bch_fs *); ++static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (bch2_snapshot_is_ancestor(c, id, *i)) ++ return true; ++ return false; ++} ++ ++static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ int ret; ++ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ret = darray_push(s, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *); ++int bch2_fs_check_subvols(struct bch_fs *); ++ +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_fs_snapshots_start(struct bch_fs *); + @@ -72499,6 +72961,9 @@ index 000000000..b1739d29c +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + ++int bch2_delete_dead_snapshots(struct bch_fs *); ++void bch2_delete_dead_snapshots_async(struct bch_fs *); ++ +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, @@ -72509,7 +72974,7 @@ index 000000000..b1739d29c +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 -index 000000000..f7562b5d5 +index 000000000000..f7562b5d51df --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,9 @@ @@ -72524,7 +72989,7 @@ index 000000000..f7562b5d5 +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000..8b8130993 +index 000000000000..8b8130993a59 --- /dev/null +++ b/fs/bcachefs/super-io.c @@ -0,0 +1,1602 @@ @@ -74132,7 +74597,7 @@ index 000000000..8b8130993 +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000..14a25f6fe +index 000000000000..14a25f6fe29a --- /dev/null +++ b/fs/bcachefs/super-io.h @@ -0,0 +1,126 @@ @@ -74264,10 +74729,10 @@ index 000000000..14a25f6fe +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git 
a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000..290897403 +index 000000000000..7c6348001ae3 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,1970 @@ +@@ -0,0 +1,1950 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -74294,6 +74759,7 @@ index 000000000..290897403 +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" ++#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" @@ -75200,31 +75666,10 @@ index 000000000..290897403 + up_write(&c->state_lock); + return ret; +err: -+ switch (ret) { -+ case BCH_FSCK_ERRORS_NOT_FIXED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("mount with -o fix_errors to repair\n"); -+ break; -+ case BCH_FSCK_REPAIR_UNIMPLEMENTED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); -+ break; -+ case BCH_FSCK_REPAIR_IMPOSSIBLE: -+ bch_err(c, "filesystem contains errors, but repair impossible"); -+ break; -+ case BCH_FSCK_UNKNOWN_VERSION: -+ bch_err(c, "unknown metadata version"); -+ break; -+ case -ENOMEM: -+ bch_err(c, "cannot allocate memory"); -+ break; -+ case -EIO: -+ bch_err(c, "IO error"); -+ break; -+ } ++ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); + -+ if (ret >= 0) -+ ret = -EIO; ++ if (ret < -BCH_ERR_START) ++ ret = -EINVAL; + goto out; +} + @@ -75708,7 +76153,7 @@ index 000000000..290897403 + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) -+ bch_err(c, "error %i removing dev alloc info", ret); ++ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); + + return ret; +} @@ -75736,7 +76181,7 @@ index 000000000..290897403 + + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + if (ret) { -+ bch_err(ca, "Remove failed: error %i dropping data", ret); ++ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); + goto err; + } + @@ -75748,7 +76193,7 @@ index 000000000..290897403 + + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + if (ret) { -+ bch_err(ca, "Remove failed: error %i flushing journal", ret); ++ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); + goto err; + } + @@ -75760,7 +76205,7 @@ index 000000000..290897403 + + ret = bch2_replicas_gc2(c); + if (ret) { -+ bch_err(ca, "Remove failed: error %i from replicas gc", ret); ++ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); + goto err; + } + @@ -75824,7 +76269,7 @@ index 000000000..290897403 + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { -+ bch_err(c, "device add error: error reading super: %i", ret); ++ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); + goto err; + } + @@ -75917,13 +76362,13 @@ index 000000000..290897403 + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { -+ bch_err(c, "device add error: error marking new superblock: %i", ret); ++ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); + goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { -+ bch_err(c, "device add error: error initializing free space: %i", ret); ++ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + goto err_late; + } + @@ -75985,8 +76430,8 @@ index 000000000..290897403 + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { 
-+ bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", -+ path, ret); ++ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", ++ path, bch2_err_str(ret)); + goto err; + } + @@ -76055,7 +76500,7 @@ index 000000000..290897403 + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + if (ret) { -+ bch_err(ca, "Resize error: %i", ret); ++ bch_err(ca, "Resize error: %s", bch2_err_str(ret)); + goto err; + } + @@ -76240,7 +76685,7 @@ index 000000000..290897403 +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 -index 000000000..8501adaff +index 000000000000..8501adaff4c2 --- /dev/null +++ b/fs/bcachefs/super.h @@ -0,0 +1,264 @@ @@ -76510,7 +76955,7 @@ index 000000000..8501adaff +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000..89419fc79 +index 000000000000..89419fc7930d --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,51 @@ @@ -76567,7 +77012,7 @@ index 000000000..89419fc79 +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000..2c650055f +index 000000000000..2c650055f530 --- /dev/null +++ b/fs/bcachefs/sysfs.c @@ -0,0 +1,943 @@ @@ -77516,7 +77961,7 @@ index 000000000..2c650055f +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h new file mode 100644 -index 000000000..222cd5062 +index 000000000000..222cd5062702 --- /dev/null +++ b/fs/bcachefs/sysfs.h @@ -0,0 +1,48 @@ @@ -77570,10 +78015,10 @@ index 000000000..222cd5062 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000..1954891ce +index 000000000000..56058a56f2a2 --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,947 @@ +@@ -0,0 +1,976 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + @@ -77618,29 +78063,29 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete: %i", ret); ++ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); + goto err; + } + + pr_info("deleting once"); -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (first) in test_delete: %i", ret); ++ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); + goto err; + } + + pr_info("deleting twice"); -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (second) in test_delete: %i", ret); ++ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); + goto err; + } +err: @@ -77664,22 +78109,22 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete_written: 
%i", ret); ++ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); + goto err; + } + + bch2_trans_unlock(&trans); + bch2_journal_flush_all_pins(&c->journal); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error in test_delete_written: %i", ret); ++ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); + goto err; + } +err: @@ -77712,7 +78157,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate: %i", ret); ++ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77721,20 +78166,30 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(k.k->p.offset != i++); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) -+ BUG_ON(k.k->p.offset != --i); ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != --i); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } + + BUG_ON(i); +err: @@ -77768,7 +78223,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_extents: %i", ret); ++ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77777,19 +78232,31 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { -+ BUG_ON(k.k->p.offset != i); -+ i = bkey_start_offset(k.k); ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } + + BUG_ON(i); @@ -77823,7 +78290,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots: %i", ret); ++ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77832,15 +78299,16 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; 
++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(i != nr * 2); + @@ -77848,17 +78316,23 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i >= nr * 2) ++ break; ++ + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); + + i++; -+ if (i == nr * 2) -+ break; ++ 0; ++ })); ++ if (ret < 0) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = 0; +err: + bch2_trans_exit(&trans); + return ret; @@ -77889,7 +78363,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); ++ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77898,13 +78372,17 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(i != nr); + @@ -77912,19 +78390,23 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i == nr) ++ break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; -+ -+ if (i == nr) -+ break; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = 0; +err: + bch2_trans_exit(&trans); + return 0; @@ -77944,10 +78426,10 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + + bch2_trans_iter_exit(&trans, &iter); @@ -77965,10 +78447,10 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + + bch2_trans_iter_exit(&trans, &iter); @@ -77995,7 +78477,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) -+ bch_err(c, "insert error in insert_test_extent: %i", ret); ++ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); + return ret; +} + @@ -78058,7 +78540,7 
@@ index 000000000..1954891ce + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + + BUG_ON(k.k->p.snapshot != U32_MAX); + @@ -78094,7 +78576,7 @@ index 000000000..1954891ce + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { -+ bch_err(c, "err %i from test_snapshot_filter", ret); ++ bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); + return ret; + } + @@ -78128,10 +78610,10 @@ index 000000000..1954891ce + k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert: %i", ret); ++ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); + break; + } + } @@ -78157,7 +78639,7 @@ index 000000000..1954891ce + k[j].k.p.snapshot = U32_MAX; + } + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: @@ -78167,7 +78649,7 @@ index 000000000..1954891ce + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert_multi: %i", ret); ++ bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); + break; + } + } @@ -78191,10 +78673,10 @@ index 000000000..1954891ce + for (i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) { -+ bch_err(c, "error in rand_lookup: %i", ret); ++ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); + break; + } + } @@ -78214,10 +78696,10 @@ index 000000000..1954891ce + + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + -+ k = bch2_btree_iter_peek(iter); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(iter))); + ret = bkey_err(k); -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "lookup error in rand_mixed: %i", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); + if (ret) + return ret; + @@ -78244,10 +78726,10 @@ index 000000000..1954891ce + + for (i = 0; i < nr; i++) { + rand = test_rand(); -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + if (ret) { -+ bch_err(c, "update error in rand_mixed: %i", ret); ++ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); + break; + } + } @@ -78265,7 +78747,7 @@ index 000000000..1954891ce + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) + goto err; @@ -78290,10 +78772,10 @@ index 000000000..1954891ce + for (i = 0; i < nr; i++) { + struct bpos pos = SPOS(0, test_rand(), U32_MAX); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); + if (ret) { -+ bch_err(c, "error in rand_delete: 
%i", ret); ++ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); + break; + } + } @@ -78309,28 +78791,23 @@ index 000000000..1954891ce + struct bkey_s_c k; + struct bkey_i_cookie insert; + int ret = 0; -+ u64 i = 0; + + bkey_cookie_init(&insert.k_i); + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ insert.k.p = iter.pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, &insert.k_i, 0)); -+ if (ret) { -+ bch_err(c, "error in seq_insert: %i", ret); -+ break; -+ } -+ -+ if (++i == nr) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ if (iter.pos.offset >= nr) ++ break; ++ insert.k.p = iter.pos; ++ bch2_trans_update(&trans, &iter, &insert.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -78345,10 +78822,11 @@ index 000000000..1954891ce + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0, k, ret) -+ ; -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ++ 0); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -78363,22 +78841,18 @@ index 000000000..1954891ce + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_INTENT, k, ret) { -+ struct bkey_i_cookie u; ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ struct bkey_i_cookie u; + -+ bkey_reassemble(&u.k_i, k); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, &u.k_i, 0)); -+ if (ret) { -+ bch_err(c, "error in seq_overwrite: %i", ret); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ bkey_reassemble(&u.k_i, k); ++ bch2_trans_update(&trans, &iter, &u.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -78392,7 +78866,7 @@ index 000000000..1954891ce + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); + if (ret) -+ bch_err(c, "error in seq_delete: %i", ret); ++ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); + return ret; +} + @@ -78429,7 +78903,7 @@ index 000000000..1954891ce + + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); + if (ret) { -+ bch_err(j->c, "%ps: error %i", j->fn, ret); ++ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); + j->ret = ret; + } + @@ -78523,7 +78997,7 @@ index 000000000..1954891ce +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h new file mode 100644 -index 000000000..c73b18aea +index 000000000000..c73b18aea7e0 --- /dev/null +++ b/fs/bcachefs/tests.h @@ -0,0 +1,15 @@ @@ -78544,7 +79018,7 @@ index 000000000..c73b18aea +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 -index 000000000..59e8dfa3d +index 000000000000..59e8dfa3d245 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,12 @@ @@ -78562,10 +79036,10 @@ 
index 000000000..59e8dfa3d +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000..8ef4b5915 +index 000000000000..ee2c7d9e7050 --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,958 @@ +@@ -0,0 +1,964 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -78944,31 +79418,37 @@ index 000000000..8ef4b5915 + u64 q, last_q = 0; + int i; + -+ prt_printf(out, "count:\t\t%llu\n", ++ prt_printf(out, "count:\t\t%llu", + stats->count); -+ prt_printf(out, "rate:\t\t%llu/sec\n", ++ prt_newline(out); ++ prt_printf(out, "rate:\t\t%llu/sec", + freq ? div64_u64(NSEC_PER_SEC, freq) : 0); ++ prt_newline(out); + + prt_printf(out, "frequency:\t"); + pr_time_units(out, freq); + -+ prt_printf(out, "\navg duration:\t"); ++ prt_newline(out); ++ prt_printf(out, "avg duration:\t"); + pr_time_units(out, stats->average_duration); + -+ prt_printf(out, "\nmax duration:\t"); ++ prt_newline(out); ++ prt_printf(out, "max duration:\t"); + pr_time_units(out, stats->max_duration); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + -+ prt_printf(out, "\nquantiles (%s):\t", u->name); ++ prt_newline(out); ++ prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); -+ prt_printf(out, "%llu%s", -+ div_u64(q, u->nsecs), -+ is_last ? "\n" : " "); ++ prt_printf(out, "%llu ", ++ div_u64(q, u->nsecs)); ++ if (is_last) ++ prt_newline(out); + last_q = q; + } +} @@ -79526,7 +80006,7 @@ index 000000000..8ef4b5915 +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000..1fe66fd91 +index 000000000000..1fe66fd91ccc --- /dev/null +++ b/fs/bcachefs/util.h @@ -0,0 +1,783 @@ @@ -80315,7 +80795,7 @@ index 000000000..1fe66fd91 +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 -index 000000000..5143b603b +index 000000000000..5143b603bf67 --- /dev/null +++ b/fs/bcachefs/varint.c @@ -0,0 +1,121 @@ @@ -80442,7 +80922,7 @@ index 000000000..5143b603b +} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h new file mode 100644 -index 000000000..92a182fb3 +index 000000000000..92a182fb3d7a --- /dev/null +++ b/fs/bcachefs/varint.h @@ -0,0 +1,11 @@ @@ -80459,7 +80939,7 @@ index 000000000..92a182fb3 +#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 -index 000000000..53a694d71 +index 000000000000..53a694d71967 --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ @@ -80528,7 +81008,7 @@ index 000000000..53a694d71 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000..123612716 +index 000000000000..186ffab542d5 --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,648 @@ @@ -80878,7 +81358,7 @@ index 000000000..123612716 + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -81182,7 +81662,7 @@ index 000000000..123612716 +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 -index 000000000..66d7a1e30 +index 000000000000..66d7a1e30350 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ @@ -81237,7 +81717,7 @@ index 000000000..66d7a1e30 + +#endif /* _BCACHEFS_XATTR_H 
*/ diff --git a/fs/d_path.c b/fs/d_path.c -index e4e0ebad1..1bd9e85f2 100644 +index e4e0ebad1f15..1bd9e85f2f65 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -5,6 +5,7 @@ @@ -81290,7 +81770,7 @@ index e4e0ebad1..1bd9e85f2 100644 * Helper function for dentry_operations.d_dname() members */ diff --git a/fs/dcache.c b/fs/dcache.c -index 93f4f5ee0..d90ed65e2 100644 +index 93f4f5ee07bf..d90ed65e2a75 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3193,9 +3193,8 @@ void d_genocide(struct dentry *parent) @@ -81319,7 +81799,7 @@ index 93f4f5ee0..d90ed65e2 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c -index bd4da9c52..ac0da28a1 100644 +index bd4da9c5207e..ac0da28a1ac6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,23 @@ @@ -81806,7 +82286,7 @@ index bd4da9c52..ac0da28a1 100644 14, HASH_ZERO, diff --git a/include/linux/bio.h b/include/linux/bio.h -index 00450fd86..c11103a87 100644 +index 00450fd86bb4..c11103a8720a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,7 +483,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, @@ -81824,7 +82304,7 @@ index 00450fd86..c11103a87 100644 static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 108e3d114..20f76bd27 100644 +index 108e3d114bfc..20f76bd27b9a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -873,6 +873,7 @@ extern const char *blk_op_str(unsigned int op); @@ -81839,7 +82319,7 @@ diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h similarity index 94% rename from drivers/md/bcache/closure.h rename to include/linux/closure.h -index c88cdc4ae..36b4a83f9 100644 +index c88cdc4ae4ec..36b4a83f9b77 100644 --- a/drivers/md/bcache/closure.h +++ b/include/linux/closure.h @@ -155,7 +155,7 @@ struct closure { @@ -81947,7 +82427,7 @@ index c88cdc4ae..36b4a83f9 100644 + #endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index 445e80517..57e7d0b94 100644 +index 445e80517cab..57e7d0b94119 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,9 @@ @@ -81961,7 +82441,7 @@ index 445e80517..57e7d0b94 100644 + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index f5bba5148..6c661059a 100644 +index f5bba51480b2..6c661059a55b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -248,6 +248,7 @@ extern struct dentry * d_make_root(struct inode *); @@ -81981,7 +82461,7 @@ index f5bba5148..6c661059a 100644 /* Allocation counts.. 
*/ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h -index fe848901f..5a3cc0e1d 100644 +index fe848901fcc3..5a3cc0e1da9b 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -98,6 +98,12 @@ enum fid_type { @@ -81998,7 +82478,7 @@ index fe848901f..5a3cc0e1d 100644 * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fs.h b/include/linux/fs.h -index bbde95387..98f62ebf9 100644 +index bbde95387a23..98f62ebf9224 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -637,7 +637,8 @@ struct inode { @@ -82039,7 +82519,7 @@ index bbde95387..98f62ebf9 100644 } diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h -index 107613f7d..c74b73769 100644 +index 107613f7d792..c74b7376990d 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -38,6 +38,7 @@ @@ -82150,7 +82630,7 @@ index 107613f7d..c74b73769 100644 /** diff --git a/include/linux/kernel.h b/include/linux/kernel.h -index fe6efb24d..9ba5a53c6 100644 +index fe6efb24d151..9ba5a53c6ad5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -202,11 +202,17 @@ static inline void might_fault(void) { } @@ -82185,7 +82665,7 @@ index fe6efb24d..9ba5a53c6 100644 /* diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h -index ae1b54144..8ee2bf5af 100644 +index ae1b541446c9..8ee2bf5af131 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) @@ -82218,7 +82698,7 @@ index ae1b54144..8ee2bf5af 100644 { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 467b94257..c46b0c76c 100644 +index 467b94257105..c46b0c76c064 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -336,6 +336,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); @@ -82241,7 +82721,7 @@ index 467b94257..c46b0c76c 100644 enum xhlock_context_t { diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h new file mode 100644 -index 000000000..f39d8edfb +index 000000000000..f39d8edfba02 --- /dev/null +++ b/include/linux/pretty-printers.h @@ -0,0 +1,10 @@ @@ -82257,7 +82737,7 @@ index 000000000..f39d8edfb +#endif /* _LINUX_PRETTY_PRINTERS_H */ diff --git a/include/linux/printbuf.h b/include/linux/printbuf.h new file mode 100644 -index 000000000..861c5d75f +index 000000000000..861c5d75f852 --- /dev/null +++ b/include/linux/printbuf.h @@ -0,0 +1,283 @@ @@ -82545,7 +83025,7 @@ index 000000000..861c5d75f + +#endif /* _LINUX_PRINTBUF_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index a8911b1f3..252bac976 100644 +index a8911b1f35aa..252bac976763 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -859,6 +859,7 @@ struct task_struct { @@ -82558,7 +83038,7 @@ index a8911b1f3..252bac976 100644 struct vmacache vmacache; diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h deleted file mode 100644 -index 5b31c5147..000000000 +index 5b31c5147969..000000000000 --- a/include/linux/seq_buf.h +++ /dev/null @@ -1,162 +0,0 @@ @@ -82725,7 +83205,7 @@ index 5b31c5147..000000000 - -#endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h -index 76fbf92b0..12967748f 100644 +index 76fbf92b04d9..12967748f9f7 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,8 @@ @@ -82768,7 +83248,7 @@ index 76fbf92b0..12967748f 100644 
#endif diff --git a/include/linux/six.h b/include/linux/six.h new file mode 100644 -index 000000000..477c33eb0 +index 000000000000..477c33eb00d7 --- /dev/null +++ b/include/linux/six.h @@ -0,0 +1,203 @@ @@ -82976,7 +83456,7 @@ index 000000000..477c33eb0 + +#endif /* _LINUX_SIX_H */ diff --git a/include/linux/string.h b/include/linux/string.h -index b6572aeca..0a737d5b9 100644 +index b6572aeca2f5..0a737d5b9203 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -195,7 +195,12 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); @@ -82993,7 +83473,7 @@ index b6572aeca..0a737d5b9 100644 int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h -index 4d72258d4..52e0f1d28 100644 +index 4d72258d42fd..52e0f1d283b9 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -10,6 +10,7 @@ @@ -83033,7 +83513,7 @@ index 4d72258d4..52e0f1d28 100644 unsigned int flags, const char *only) { diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index e6e95a9f0..48471e32f 100644 +index e6e95a9f07a5..48471e32f8e4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -496,7 +496,7 @@ struct dynevent_cmd; @@ -83046,7 +83526,7 @@ index e6e95a9f0..48471e32f 100644 unsigned int n_fields; enum dynevent_type type; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h -index 5a2c650d9..d2b51007b 100644 +index 5a2c650d9e1c..d2b51007b3b9 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -2,10 +2,12 @@ @@ -83118,7 +83598,7 @@ index 5a2c650d9..d2b51007b 100644 extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index b159c2789..0f4151e98 100644 +index b159c2789961..0f4151e98331 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -144,6 +144,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1); @@ -83129,12 +83609,92 @@ index b159c2789..0f4151e98 100644 extern void *vmalloc_32(unsigned long size) __alloc_size(1); extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h +index 24a509f559ee..0b20ee6854d6 100644 +--- a/include/net/9p/9p.h ++++ b/include/net/9p/9p.h +@@ -539,12 +539,12 @@ struct p9_rstatfs { + struct p9_fcall { + u32 size; + u8 id; ++ bool used_mempool; + u16 tag; + + size_t offset; + size_t capacity; + +- struct kmem_cache *cache; + u8 *sdata; + }; + +diff --git a/include/net/9p/client.h b/include/net/9p/client.h +index ec1d1706f43c..832dcc866a20 100644 +--- a/include/net/9p/client.h ++++ b/include/net/9p/client.h +@@ -9,6 +9,7 @@ + #ifndef NET_9P_CLIENT_H + #define NET_9P_CLIENT_H + ++#include + #include + #include + +@@ -76,7 +77,7 @@ enum p9_req_status_t { + struct p9_req_t { + int status; + int t_err; +- struct kref refcount; ++ refcount_t refcount; + wait_queue_head_t wq; + struct p9_fcall tc; + struct p9_fcall rc; +@@ -107,6 +108,14 @@ struct p9_client { + void *trans; + struct kmem_cache *fcall_cache; + ++ /* ++ * We need two identical mempools because it's not safe to allocate ++ * multiple elements from the same pool (without freeing the first); ++ * that will deadlock if multiple threads need the last element at the ++ * 
same time. ++ */ ++ mempool_t pools[2]; ++ + union { + struct { + int rfd; +@@ -222,20 +231,21 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, + kgid_t gid, struct p9_qid *qid); + int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status); + int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); +-void p9_fcall_fini(struct p9_fcall *fc); ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx); + struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag); + + static inline void p9_req_get(struct p9_req_t *r) + { +- kref_get(&r->refcount); ++ refcount_inc(&r->refcount); + } + + static inline int p9_req_try_get(struct p9_req_t *r) + { +- return kref_get_unless_zero(&r->refcount); ++ return refcount_inc_not_zero(&r->refcount); + } + +-int p9_req_put(struct p9_req_t *r); ++int p9_req_put(struct p9_client *c, struct p9_req_t *r); + + void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); + diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000..66ad356e9 +index 000000000000..140834e7406e --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,1020 @@ +@@ -0,0 +1,1048 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -83447,24 +84007,27 @@ index 000000000..66ad356e9 +); + +TRACE_EVENT(btree_reserve_get_fail, -+ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), -+ TP_ARGS(c, required, cl), ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ size_t required), ++ TP_ARGS(trans_fn, caller_ip, required), + + TP_STRUCT__entry( -+ __field(dev_t, dev ) ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) + __field(size_t, required ) -+ __field(struct closure *, cl ) + ), + + TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->required = required; -+ __entry->cl = cl; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->required = required; + ), + -+ TP_printk("%d,%d required %zu by %p", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->required, __entry->cl) ++ TP_printk("%s %pS required %zu", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->required) +); + +DEFINE_EVENT(btree_node, btree_split, @@ -83593,55 +84156,68 @@ index 000000000..66ad356e9 + +TRACE_EVENT(bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 free, + u64 avail, ++ u64 copygc_wait_amount, ++ s64 copygc_waiting_for, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, -+ int ret), -+ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), ++ const char *err), ++ TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, ++ seen, open, need_journal_commit, nouse, nonblocking, err), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, reserve, 16 ) ++ __field(u64, free ) + __field(u64, avail ) ++ __field(u64, copygc_wait_amount ) ++ __field(s64, copygc_waiting_for ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) -+ __field(int, ret ) ++ __array(char, err, 16 ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; + strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ __entry->free = free; + __entry->avail = avail; ++ __entry->copygc_wait_amount = 
copygc_wait_amount; ++ __entry->copygc_waiting_for = copygc_waiting_for; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; -+ __entry->ret = ret; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); + ), + -+ TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", ++ TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve, ++ __entry->free, + __entry->avail, ++ __entry->copygc_wait_amount, ++ __entry->copygc_waiting_for, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nonblocking, -+ __entry->ret) ++ __entry->err) +); + +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, -+ u64 need_journal_commit, u64 discarded, int ret), -+ TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), ++ u64 need_journal_commit, u64 discarded, const char *err), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, err), + + TP_STRUCT__entry( + __field(dev_t, dev ) @@ -83649,7 +84225,7 @@ index 000000000..66ad356e9 + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) -+ __field(int, ret ) ++ __array(char, err, 16 ) + ), + + TP_fast_assign( @@ -83658,16 +84234,16 @@ index 000000000..66ad356e9 + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; -+ __entry->ret = ret; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); + ), + -+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, -+ __entry->ret) ++ __entry->err) +); + +TRACE_EVENT(invalidate_bucket, @@ -83815,6 +84391,12 @@ index 000000000..66ad356e9 + TP_ARGS(trans_fn, caller_ip) +); + ++DEFINE_EVENT(transaction_event, transaction_restart_injected, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), @@ -83863,6 +84445,12 @@ index 000000000..66ad356e9 + TP_ARGS(trans_fn, caller_ip) +); + ++DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ +DECLARE_EVENT_CLASS(transaction_restart_iter, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, @@ -84156,7 +84744,7 @@ index 000000000..66ad356e9 +/* This part must be outside protection */ +#include diff --git a/init/init_task.c b/init/init_task.c -index 73cc8f035..3e3aed110 100644 +index 73cc8f03511a..3e3aed110153 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -85,6 +85,7 @@ struct task_struct init_task @@ -84168,7 +84756,7 @@ index 73cc8f035..3e3aed110 100644 .fn = do_no_restart_syscall, }, diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 4198f0273..b2abd9a5d 100644 +index 4198f0273ecd..b2abd9a5d9ab 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB @@ -84179,7 +84767,7 @@ index 4198f0273..b2abd9a5d 
100644 +config SIXLOCKS + bool diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index d51cabf28..cadbf6520 100644 +index d51cabf28f38..cadbf6520c4b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o @@ -84188,7 +84776,7 @@ index d51cabf28..cadbf6520 100644 obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o +obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index c06cab654..9426050d3 100644 +index c06cab6546ed..9426050d30d9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6459,6 +6459,26 @@ void debug_check_no_locks_held(void) @@ -84220,7 +84808,7 @@ index c06cab654..9426050d3 100644 { diff --git a/kernel/locking/six.c b/kernel/locking/six.c new file mode 100644 -index 000000000..fca120872 +index 000000000000..fca1208720b6 --- /dev/null +++ b/kernel/locking/six.c @@ -0,0 +1,759 @@ @@ -84984,7 +85572,7 @@ index 000000000..fca120872 +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); diff --git a/kernel/module.c b/kernel/module.c -index 6529c84c5..df4959bda 100644 +index 6529c84c536f..df4959bda595 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2834,9 +2834,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) @@ -84998,8 +85586,28 @@ index 6529c84c5..df4959bda 100644 } bool __weak module_init_section(const char *name) +diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c +index 9ed5ce989415..3428568bb3f1 100644 +--- a/kernel/stacktrace.c ++++ b/kernel/stacktrace.c +@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + put_task_stack(tsk); + return c.len; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array +@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index 114c31bdf..7c7fd7b66 100644 +index c0c98b0c86e7..9d91153228fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1672,15 +1672,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) @@ -85119,7 +85727,7 @@ index 114c31bdf..7c7fd7b66 100644 printk(KERN_TRACE "%s", s->buffer); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c -index e34e8182e..eabeeb97b 100644 +index e34e8182ee4b..eabeeb97b55e 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -295,21 +295,19 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, @@ -85209,7 +85817,7 @@ index e34e8182e..eabeeb97b 100644 cmd->run_command = run_command; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c -index b458a9afa..70cfd1241 100644 +index b458a9afa2c0..70cfd1241018 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1059,7 +1059,7 @@ static void append_filter_err(struct trace_array *tr, @@ -85222,7 +85830,7 @@ index b458a9afa..70cfd1241 100644 kfree(filter->filter_string); filter->filter_string = buf; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c -index 5e8c07aef..ddb2a2737 100644 +index 5e8c07aef071..ddb2a2737b82 100644 --- a/kernel/trace/trace_events_synth.c +++ 
b/kernel/trace/trace_events_synth.c @@ -5,13 +5,14 @@ @@ -85303,7 +85911,7 @@ index 5e8c07aef..ddb2a2737 100644 return ret; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c -index 203204cad..9f270fdde 100644 +index 203204cadf92..9f270fdde99b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1022,9 +1022,9 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, @@ -85320,7 +85928,7 @@ index 203204cad..9f270fdde 100644 trace_seq_puts(s, " */\n"); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c -index 134397432..6e4485b04 100644 +index 13439743285c..6e4485b042d8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -915,7 +915,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) @@ -85333,7 +85941,7 @@ index 134397432..6e4485b04 100644 /** diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c -index 9c90b3a7d..48c08f29c 100644 +index 9c90b3a7dce2..48c08f29c342 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -25,11 +25,9 @@ @@ -85612,7 +86220,7 @@ index 9c90b3a7d..48c08f29c 100644 return 0; } diff --git a/lib/Kconfig b/lib/Kconfig -index 55f0bba8f..9161ac314 100644 +index 55f0bba8f8c0..9161ac314358 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -491,6 +491,9 @@ config ASSOCIATIVE_ARRAY @@ -85626,7 +86234,7 @@ index 55f0bba8f..9161ac314 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 7e2829701..2bef39841 100644 +index 7e282970177a..2bef39841f8e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1723,6 +1723,15 @@ config DEBUG_CREDENTIALS @@ -85646,7 +86254,7 @@ index 7e2829701..2bef39841 100644 bool "Force round-robin CPU selection for unbound work items" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile -index 60843ab66..d98f3c92b 100644 +index 60843ab661ba..d98f3c92badb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,11 +30,11 @@ endif @@ -85677,7 +86285,7 @@ diff --git a/drivers/md/bcache/closure.c b/lib/closure.c similarity index 88% rename from drivers/md/bcache/closure.c rename to lib/closure.c -index d8d9394a6..b38ded00b 100644 +index d8d9394a6beb..b38ded00b9b0 100644 --- a/drivers/md/bcache/closure.c +++ b/lib/closure.c @@ -6,13 +6,12 @@ @@ -85790,8 +86398,17 @@ index d8d9394a6..b38ded00b 100644 -MODULE_AUTHOR("Kent Overstreet "); -MODULE_LICENSE("GPL"); +#endif +diff --git a/lib/errname.c b/lib/errname.c +index 05cbf731545f..82ea4778f478 100644 +--- a/lib/errname.c ++++ b/lib/errname.c +@@ -222,3 +222,4 @@ const char *errname(int err) + + return err > 0 ? 
name + 1 : name; + } ++EXPORT_SYMBOL(errname); diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c -index f25eb111c..41f1bcdc4 100644 +index f25eb111c051..41f1bcdc4488 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -1,4 +1,5 @@ @@ -85898,7 +86515,7 @@ index f25eb111c..41f1bcdc4 100644 { if (level) { diff --git a/lib/hexdump.c b/lib/hexdump.c -index 06833d404..9556f15ad 100644 +index 06833d404398..9556f15ad295 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -9,6 +9,7 @@ @@ -86205,7 +86822,7 @@ index 06833d404..9556f15ad 100644 unsigned char linebuf[32 * 3 + 2 + 32 + 1]; diff --git a/lib/pretty-printers.c b/lib/pretty-printers.c new file mode 100644 -index 000000000..addbac95e +index 000000000000..addbac95e065 --- /dev/null +++ b/lib/pretty-printers.c @@ -0,0 +1,60 @@ @@ -86271,7 +86888,7 @@ index 000000000..addbac95e +EXPORT_SYMBOL(prt_bitflags); diff --git a/lib/printbuf.c b/lib/printbuf.c new file mode 100644 -index 000000000..047470025 +index 000000000000..047470025748 --- /dev/null +++ b/lib/printbuf.c @@ -0,0 +1,258 @@ @@ -86535,7 +87152,7 @@ index 000000000..047470025 +EXPORT_SYMBOL(prt_units_s64); diff --git a/lib/seq_buf.c b/lib/seq_buf.c deleted file mode 100644 -index 0a68f7aa8..000000000 +index 0a68f7aa85d6..000000000000 --- a/lib/seq_buf.c +++ /dev/null @@ -1,397 +0,0 @@ @@ -86937,7 +87554,7 @@ index 0a68f7aa8..000000000 - return 0; -} diff --git a/lib/string_helpers.c b/lib/string_helpers.c -index 5ed3beb06..d247bf945 100644 +index 5ed3beb066e6..d247bf945f16 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -15,6 +15,7 @@ @@ -87277,7 +87894,7 @@ index 5ed3beb06..d247bf945 100644 EXPORT_SYMBOL(string_escape_mem); diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c -index 5144899d3..f9e97879d 100644 +index 5144899d3c6b..f9e97879dcdf 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -25,36 +25,19 @@ static const char * const test_data_1[] __initconst = { @@ -87344,7 +87961,7 @@ index 5144899d3..f9e97879d 100644 result = test_data_1; diff --git a/lib/test_printf.c b/lib/test_printf.c -index 07309c45f..ac5f9f0eb 100644 +index 07309c45f327..ac5f9f0eb4e0 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -9,6 +9,7 @@ @@ -87409,7 +88026,7 @@ index 07309c45f..ac5f9f0eb 100644 kfree(alloced_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 40d26a07a..dfca8a7c9 100644 +index 40d26a07a133..dfca8a7c93ed 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -44,6 +44,7 @@ @@ -90226,7 +90843,7 @@ index 40d26a07a..dfca8a7c9 100644 EXPORT_SYMBOL_GPL(bstr_printf); diff --git a/mm/Makefile b/mm/Makefile -index 4cc13f317..7e852599b 100644 +index 4cc13f3179a5..7e852599b917 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -54,7 +54,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ @@ -90239,7 +90856,7 @@ index 4cc13f317..7e852599b 100644 # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/filemap.c b/mm/filemap.c -index be1859a27..222bcfe7a 100644 +index be1859a276e1..222bcfe7afa0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2223,6 +2223,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, @@ -90251,7 +90868,7 @@ index be1859a27..222bcfe7a 100644 /** * find_get_pages_contig - gang contiguous pagecache lookup diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 598fece89..57861dc9f 100644 +index 598fece89e2b..57861dc9fee5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,7 +62,7 @@ @@ -90360,7 +90977,7 @@ index 
598fece89..57861dc9f 100644 #define K(x) ((x) << (PAGE_SHIFT-10)) diff --git a/mm/nommu.c b/mm/nommu.c -index 9d7afc2d9..dd5302026 100644 +index 9d7afc2d959e..dd53020262d8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -281,6 +281,24 @@ void *vzalloc_node(unsigned long size, int node) @@ -90389,7 +91006,7 @@ index 9d7afc2d9..dd5302026 100644 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 49d7df39b..9c550a283 100644 +index 49d7df39b02d..9c550a283037 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) @@ -90433,7 +91050,7 @@ diff --git a/lib/show_mem.c b/mm/show_mem.c similarity index 83% rename from lib/show_mem.c rename to mm/show_mem.c -index 1c26c14ff..47225158c 100644 +index 1c26c14ffbb9..47225158ce49 100644 --- a/lib/show_mem.c +++ b/mm/show_mem.c @@ -7,6 +7,9 @@ @@ -90457,7 +91074,7 @@ index 1c26c14ff..47225158c 100644 + printk("%pf()", CALL_PP(shrinkers_to_text)); } diff --git a/mm/slab.h b/mm/slab.h -index 95eb34174..a91fc5aa1 100644 +index 95eb34174c1b..a91fc5aa1054 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -805,10 +805,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -90476,7 +91093,7 @@ index 95eb34174..a91fc5aa1 100644 } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c -index 2b3206a2c..333f431e0 100644 +index 2b3206a2c3b5..333f431e0708 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -24,6 +24,7 @@ @@ -90568,7 +91185,7 @@ index 2b3206a2c..333f431e0 100644 } diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index cadfbb515..60456a184 100644 +index cadfbb5155ea..60456a184b6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3363,6 +3363,27 @@ void *vzalloc_node(unsigned long size, int node) @@ -90600,7 +91217,7 @@ index cadfbb515..60456a184 100644 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/mm/vmscan.c b/mm/vmscan.c -index 1678802e0..d911c5e33 100644 +index 1678802e03e7..d911c5e3304e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,6 +50,7 @@ @@ -90718,8 +91335,330 @@ index 1678802e0..d911c5e33 100644 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; +diff --git a/net/9p/client.c b/net/9p/client.c +index 8bba0d9cf975..e14074d031c6 100644 +--- a/net/9p/client.c ++++ b/net/9p/client.c +@@ -218,23 +218,29 @@ static int parse_opts(char *opts, struct p9_client *clnt) + return ret; + } + +-static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, +- int alloc_msize) ++static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx, unsigned alloc_msize) + { +- if (likely(c->fcall_cache) && alloc_msize == c->msize) { +- fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); +- fc->cache = c->fcall_cache; +- } else { +- fc->sdata = kmalloc(alloc_msize, GFP_NOFS); +- fc->cache = NULL; +- } +- if (!fc->sdata) +- return -ENOMEM; ++ gfp_t gfp = GFP_NOFS|__GFP_NOWARN; ++ ++ BUG_ON(alloc_msize > c->msize); ++ ++ fc->sdata = NULL; ++ fc->used_mempool = false; + fc->capacity = alloc_msize; +- return 0; ++ ++ if (alloc_msize < c->msize) ++ fc->sdata = kmalloc(alloc_msize, gfp); ++ ++ if (!fc->sdata) { ++ fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp); ++ fc->used_mempool = true; ++ fc->capacity = c->msize; ++ } + } + +-void p9_fcall_fini(struct p9_fcall *fc) ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx) + { + /* 
sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us +@@ -242,8 +248,8 @@ void p9_fcall_fini(struct p9_fcall *fc) + if (unlikely(!fc->sdata)) + return; + +- if (fc->cache) +- kmem_cache_free(fc->cache, fc->sdata); ++ if (fc->used_mempool) ++ mempool_free(fc->sdata, &c->pools[fc_idx]); + else + kfree(fc->sdata); + } +@@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) + if (!req) + return ERR_PTR(-ENOMEM); + +- if (p9_fcall_init(c, &req->tc, alloc_msize)) +- goto free_req; +- if (p9_fcall_init(c, &req->rc, alloc_msize)) +- goto free; ++ p9_fcall_init(c, &req->tc, 0, alloc_msize); ++ p9_fcall_init(c, &req->rc, 1, alloc_msize); + + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); +@@ -305,14 +309,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ +- refcount_set(&req->refcount.refcount, 2); ++ refcount_set(&req->refcount, 2); + + return req; + + free: +- p9_fcall_fini(&req->tc); +- p9_fcall_fini(&req->rc); +-free_req: ++ p9_fcall_fini(c, &req->tc, 0); ++ p9_fcall_fini(c, &req->rc, 1); + kmem_cache_free(p9_req_cache, req); + return ERR_PTR(-ENOMEM); + } +@@ -341,7 +344,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { +- p9_req_put(req); ++ p9_req_put(c, req); + goto again; + } + } +@@ -367,21 +370,18 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); +- return p9_req_put(r); +-} +- +-static void p9_req_free(struct kref *ref) +-{ +- struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); +- +- p9_fcall_fini(&r->tc); +- p9_fcall_fini(&r->rc); +- kmem_cache_free(p9_req_cache, r); ++ return p9_req_put(c, r); + } + +-int p9_req_put(struct p9_req_t *r) ++int p9_req_put(struct p9_client *c, struct p9_req_t *r) + { +- return kref_put(&r->refcount, p9_req_free); ++ if (refcount_dec_and_test(&r->refcount)) { ++ p9_fcall_fini(c, &r->tc, 0); ++ p9_fcall_fini(c, &r->rc, 1); ++ kmem_cache_free(p9_req_cache, r); ++ return 1; ++ } ++ return 0; + } + EXPORT_SYMBOL(p9_req_put); + +@@ -426,7 +426,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) + + wake_up(&req->wq); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); +- p9_req_put(req); ++ p9_req_put(c, req); + } + EXPORT_SYMBOL(p9_client_cb); + +@@ -709,7 +709,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + reterr: + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ +- p9_req_put(req); ++ p9_req_put(c, req); + return ERR_PTR(err); + } + +@@ -746,7 +746,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
+ err = c->trans_mod->request(c, req); + if (err < 0) { + /* write won't happen */ +- p9_req_put(req); ++ p9_req_put(c, req); + if (err != -ERESTARTSYS && err != -EFAULT) + c->status = Disconnected; + goto recalc_sigpending; +@@ -1002,7 +1002,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + char *client_id; + + err = 0; +- clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); ++ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + return ERR_PTR(-ENOMEM); + +@@ -1053,10 +1053,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + goto close_trans; + } + +- err = p9_client_version(clnt); +- if (err) +- goto close_trans; +- + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ +@@ -1066,6 +1062,15 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + clnt->msize - (P9_HDRSZ + 4), + NULL); + ++ err = mempool_init_slab_pool(&clnt->pools[0], 4, clnt->fcall_cache) ?: ++ mempool_init_slab_pool(&clnt->pools[1], 4, clnt->fcall_cache); ++ if (err) ++ goto close_trans; ++ ++ err = p9_client_version(clnt); ++ if (err) ++ goto close_trans; ++ + return clnt; + + close_trans: +@@ -1073,6 +1078,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + put_trans: + v9fs_put_trans(clnt->trans_mod); + free_client: ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kfree(clnt); + return ERR_PTR(err); + } +@@ -1097,6 +1104,8 @@ void p9_client_destroy(struct p9_client *clnt) + + p9_tag_cleanup(clnt); + ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kmem_cache_destroy(clnt->fcall_cache); + kfree(clnt); + } +diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c +index 8f8f95e39b03..007c3f45fe05 100644 +--- a/net/9p/trans_fd.c ++++ b/net/9p/trans_fd.c +@@ -378,7 +378,7 @@ static void p9_read_work(struct work_struct *work) + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + +@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work) + m->wpos += err; + if (m->wpos == m->wsize) { + m->wpos = m->wsize = 0; +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +@@ -695,7 +695,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) + if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; +- p9_req_put(req); ++ p9_req_put(client, req); + ret = 0; + } + spin_unlock(&client->lock); +@@ -722,7 +722,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; + spin_unlock(&client->lock); +- p9_req_put(req); ++ p9_req_put(client, req); + + return 0; + } +@@ -883,12 +883,12 @@ static void p9_conn_destroy(struct p9_conn *m) + p9_mux_poll_stop(m); + cancel_work_sync(&m->rq); + if (m->rreq) { +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + cancel_work_sync(&m->wq); + if (m->wreq) { +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c +index 88e563826674..99d878d70d56 100644 +--- a/net/9p/trans_rdma.c ++++ b/net/9p/trans_rdma.c +@@ -350,7 +350,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) + c->busa, c->req->tc.size, + DMA_TO_DEVICE); + up(&rdma->sq_sem); +- p9_req_put(c->req); ++ p9_req_put(client, c->req); + kfree(c); + } + 
+@@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) + if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { + if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { + /* Got one! */ +- p9_fcall_fini(&req->rc); ++ p9_fcall_fini(client, &req->rc, 1); + req->rc.sdata = NULL; + goto dont_need_post_recv; + } else { +diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c +index b24a4fb0f0a2..147972bf2e79 100644 +--- a/net/9p/trans_virtio.c ++++ b/net/9p/trans_virtio.c +@@ -199,7 +199,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) + /* Reply won't come, so drop req ref */ + static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) + { +- p9_req_put(req); ++ p9_req_put(client, req); + return 0; + } + +@@ -523,7 +523,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + kvfree(out_pages); + if (!kicked) { + /* reply won't come */ +- p9_req_put(req); ++ p9_req_put(client, req); + } + return err; + } +diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c +index 77883b6788cd..4cf0c78d4d22 100644 +--- a/net/9p/trans_xen.c ++++ b/net/9p/trans_xen.c +@@ -163,7 +163,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); +- p9_req_put(p9_req); ++ p9_req_put(client, p9_req); + + return 0; + } diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c -index 4d1a94736..a2097955d 100644 +index 4d1a947367f9..a2097955dace 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -12,7 +12,7 @@ @@ -90774,5 +91713,5 @@ index 4d1a94736..a2097955d 100644 static DEVICE_ATTR_RO(flags); -- -2.37.0.rc0.15.g3b9a5a33c2 +2.37.1 diff --git a/linux-tkg-patches/5.19/0008-5.19-bcachefs.patch b/linux-tkg-patches/5.19/0008-5.19-bcachefs.patch new file mode 100644 index 0000000..9673d36 --- /dev/null +++ b/linux-tkg-patches/5.19/0008-5.19-bcachefs.patch @@ -0,0 +1,91722 @@ +From db2079ca2ce5000c3dfe3656c1c7f580b053a325 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 22 Jul 2022 14:23:53 +0200 +Subject: [PATCH] 5.19-bcachefs + +Signed-off-by: Peter Jung +--- + .github/ISSUE_TEMPLATE/bug_report.md | 61 + + Documentation/core-api/printk-formats.rst | 22 + + arch/powerpc/kernel/process.c | 16 +- + arch/powerpc/kernel/security.c | 75 +- + arch/powerpc/platforms/pseries/papr_scm.c | 34 +- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 16 +- + block/bio.c | 34 +- + block/blk-core.c | 1 + + block/blk.h | 1 - + drivers/acpi/apei/erst-dbg.c | 1 + + drivers/block/loop.c | 2 - + drivers/clk/tegra/clk-bpmp.c | 21 +- + drivers/input/joystick/analog.c | 23 +- + drivers/md/bcache/Kconfig | 10 +- + drivers/md/bcache/Makefile | 4 +- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/util.h | 3 +- + drivers/pci/p2pdma.c | 21 +- + fs/Kconfig | 1 + + fs/Makefile | 1 + + fs/bcachefs/Kconfig | 59 + + fs/bcachefs/Makefile | 69 + + fs/bcachefs/acl.c | 406 ++ + fs/bcachefs/acl.h | 58 + + fs/bcachefs/alloc_background.c | 1552 ++++++++ + fs/bcachefs/alloc_background.h | 183 + + fs/bcachefs/alloc_foreground.c | 1380 +++++++ + fs/bcachefs/alloc_foreground.h | 181 + + fs/bcachefs/alloc_types.h | 87 + + fs/bcachefs/backpointers.c | 875 ++++ + fs/bcachefs/backpointers.h | 38 + + fs/bcachefs/bcachefs.h | 1000 +++++ + fs/bcachefs/bcachefs_format.h | 2052 ++++++++++ + fs/bcachefs/bcachefs_ioctl.h | 368 ++ + 
fs/bcachefs/bkey.c | 1175 ++++++ + fs/bcachefs/bkey.h | 566 +++ + fs/bcachefs/bkey_buf.h | 60 + + fs/bcachefs/bkey_methods.c | 503 +++ + fs/bcachefs/bkey_methods.h | 175 + + fs/bcachefs/bkey_sort.c | 198 + + fs/bcachefs/bkey_sort.h | 44 + + fs/bcachefs/bset.c | 1598 ++++++++ + fs/bcachefs/bset.h | 615 +++ + fs/bcachefs/btree_cache.c | 1170 ++++++ + fs/bcachefs/btree_cache.h | 107 + + fs/bcachefs/btree_gc.c | 2098 ++++++++++ + fs/bcachefs/btree_gc.h | 112 + + fs/bcachefs/btree_io.c | 2150 ++++++++++ + fs/bcachefs/btree_io.h | 222 ++ + fs/bcachefs/btree_iter.c | 3515 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 556 +++ + fs/bcachefs/btree_key_cache.c | 855 ++++ + fs/bcachefs/btree_key_cache.h | 47 + + fs/bcachefs/btree_locking.h | 289 ++ + fs/bcachefs/btree_types.h | 697 ++++ + fs/bcachefs/btree_update.h | 158 + + fs/bcachefs/btree_update_interior.c | 2266 +++++++++++ + fs/bcachefs/btree_update_interior.h | 321 ++ + fs/bcachefs/btree_update_leaf.c | 1800 +++++++++ + fs/bcachefs/buckets.c | 2113 ++++++++++ + fs/bcachefs/buckets.h | 300 ++ + fs/bcachefs/buckets_types.h | 103 + + fs/bcachefs/buckets_waiting_for_journal.c | 167 + + fs/bcachefs/buckets_waiting_for_journal.h | 15 + + .../buckets_waiting_for_journal_types.h | 23 + + fs/bcachefs/chardev.c | 760 ++++ + fs/bcachefs/chardev.h | 31 + + fs/bcachefs/checksum.c | 712 ++++ + fs/bcachefs/checksum.h | 204 + + fs/bcachefs/clock.c | 191 + + fs/bcachefs/clock.h | 38 + + fs/bcachefs/clock_types.h | 37 + + fs/bcachefs/compress.c | 639 +++ + fs/bcachefs/compress.h | 18 + + fs/bcachefs/counters.c | 107 + + fs/bcachefs/counters.h | 17 + + fs/bcachefs/darray.h | 77 + + fs/bcachefs/data_update.c | 376 ++ + fs/bcachefs/data_update.h | 38 + + fs/bcachefs/debug.c | 764 ++++ + fs/bcachefs/debug.h | 30 + + fs/bcachefs/dirent.c | 565 +++ + fs/bcachefs/dirent.h | 67 + + fs/bcachefs/disk_groups.c | 506 +++ + fs/bcachefs/disk_groups.h | 90 + + fs/bcachefs/ec.c | 1673 ++++++++ + fs/bcachefs/ec.h | 230 ++ + fs/bcachefs/ec_types.h | 46 + + fs/bcachefs/errcode.c | 51 + + fs/bcachefs/errcode.h | 64 + + fs/bcachefs/error.c | 184 + + fs/bcachefs/error.h | 223 ++ + fs/bcachefs/extent_update.c | 178 + + fs/bcachefs/extent_update.h | 12 + + fs/bcachefs/extents.c | 1324 +++++++ + fs/bcachefs/extents.h | 685 ++++ + fs/bcachefs/extents_types.h | 40 + + fs/bcachefs/eytzinger.h | 281 ++ + fs/bcachefs/fifo.h | 127 + + fs/bcachefs/fs-common.c | 496 +++ + fs/bcachefs/fs-common.h | 43 + + fs/bcachefs/fs-io.c | 3496 ++++++++++++++++ + fs/bcachefs/fs-io.h | 56 + + fs/bcachefs/fs-ioctl.c | 523 +++ + fs/bcachefs/fs-ioctl.h | 81 + + fs/bcachefs/fs.c | 1939 +++++++++ + fs/bcachefs/fs.h | 208 + + fs/bcachefs/fsck.c | 2390 +++++++++++ + fs/bcachefs/fsck.h | 8 + + fs/bcachefs/inode.c | 771 ++++ + fs/bcachefs/inode.h | 189 + + fs/bcachefs/io.c | 2422 ++++++++++++ + fs/bcachefs/io.h | 189 + + fs/bcachefs/io_types.h | 161 + + fs/bcachefs/journal.c | 1429 +++++++ + fs/bcachefs/journal.h | 521 +++ + fs/bcachefs/journal_io.c | 1735 ++++++++ + fs/bcachefs/journal_io.h | 59 + + fs/bcachefs/journal_reclaim.c | 852 ++++ + fs/bcachefs/journal_reclaim.h | 86 + + fs/bcachefs/journal_sb.c | 220 ++ + fs/bcachefs/journal_sb.h | 24 + + fs/bcachefs/journal_seq_blacklist.c | 322 ++ + fs/bcachefs/journal_seq_blacklist.h | 22 + + fs/bcachefs/journal_types.h | 340 ++ + fs/bcachefs/keylist.c | 67 + + fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist_types.h | 16 + + fs/bcachefs/lru.c | 206 + + fs/bcachefs/lru.h | 19 + + fs/bcachefs/migrate.c | 186 + + fs/bcachefs/migrate.h | 7 + + fs/bcachefs/move.c | 952 
+++++ + fs/bcachefs/move.h | 67 + + fs/bcachefs/move_types.h | 19 + + fs/bcachefs/movinggc.c | 285 ++ + fs/bcachefs/movinggc.h | 10 + + fs/bcachefs/opts.c | 578 +++ + fs/bcachefs/opts.h | 509 +++ + fs/bcachefs/quota.c | 823 ++++ + fs/bcachefs/quota.h | 71 + + fs/bcachefs/quota_types.h | 43 + + fs/bcachefs/rebalance.c | 361 ++ + fs/bcachefs/rebalance.h | 28 + + fs/bcachefs/rebalance_types.h | 26 + + fs/bcachefs/recovery.c | 1597 ++++++++ + fs/bcachefs/recovery.h | 58 + + fs/bcachefs/reflink.c | 422 ++ + fs/bcachefs/reflink.h | 76 + + fs/bcachefs/replicas.c | 1073 +++++ + fs/bcachefs/replicas.h | 106 + + fs/bcachefs/replicas_types.h | 10 + + fs/bcachefs/siphash.c | 173 + + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/str_hash.h | 351 ++ + fs/bcachefs/subvolume.c | 1108 ++++++ + fs/bcachefs/subvolume.h | 137 + + fs/bcachefs/subvolume_types.h | 9 + + fs/bcachefs/super-io.c | 1602 ++++++++ + fs/bcachefs/super-io.h | 126 + + fs/bcachefs/super.c | 1950 +++++++++ + fs/bcachefs/super.h | 264 ++ + fs/bcachefs/super_types.h | 51 + + fs/bcachefs/sysfs.c | 943 +++++ + fs/bcachefs/sysfs.h | 48 + + fs/bcachefs/tests.c | 976 +++++ + fs/bcachefs/tests.h | 15 + + fs/bcachefs/trace.c | 12 + + fs/bcachefs/util.c | 964 +++++ + fs/bcachefs/util.h | 783 ++++ + fs/bcachefs/varint.c | 121 + + fs/bcachefs/varint.h | 11 + + fs/bcachefs/vstructs.h | 63 + + fs/bcachefs/xattr.c | 648 +++ + fs/bcachefs/xattr.h | 50 + + fs/d_path.c | 35 + + fs/dcache.c | 10 +- + fs/inode.c | 218 +- + include/linux/bio.h | 7 +- + include/linux/blkdev.h | 1 + + .../md/bcache => include/linux}/closure.h | 39 +- + include/linux/compiler_attributes.h | 5 + + include/linux/dcache.h | 2 + + include/linux/exportfs.h | 6 + + include/linux/fs.h | 9 +- + include/linux/generic-radix-tree.h | 68 +- + include/linux/kernel.h | 12 + + include/linux/list_bl.h | 22 + + include/linux/lockdep.h | 4 + + include/linux/pretty-printers.h | 10 + + include/linux/printbuf.h | 283 ++ + include/linux/sched.h | 1 + + include/linux/seq_buf.h | 162 - + include/linux/shrinker.h | 8 + + include/linux/six.h | 203 + + include/linux/string.h | 5 + + include/linux/string_helpers.h | 8 +- + include/linux/trace_events.h | 2 +- + include/linux/trace_seq.h | 17 +- + include/linux/vmalloc.h | 1 + + include/net/9p/9p.h | 2 +- + include/net/9p/client.h | 20 +- + include/trace/events/bcachefs.h | 1048 +++++ + init/init_task.c | 1 + + kernel/Kconfig.locks | 3 + + kernel/locking/Makefile | 1 + + kernel/locking/lockdep.c | 20 + + kernel/locking/six.c | 759 ++++ + kernel/module/main.c | 4 +- + kernel/stacktrace.c | 2 + + kernel/trace/trace.c | 45 +- + kernel/trace/trace_dynevent.c | 34 +- + kernel/trace/trace_events_filter.c | 2 +- + kernel/trace/trace_events_synth.c | 32 +- + kernel/trace/trace_functions_graph.c | 6 +- + kernel/trace/trace_kprobe.c | 2 +- + kernel/trace/trace_seq.c | 111 +- + lib/Kconfig | 3 + + lib/Kconfig.debug | 9 + + lib/Makefile | 8 +- + {drivers/md/bcache => lib}/closure.c | 35 +- + lib/errname.c | 1 + + lib/generic-radix-tree.c | 76 +- + lib/hexdump.c | 246 +- + lib/pretty-printers.c | 60 + + lib/printbuf.c | 258 ++ + lib/seq_buf.c | 397 -- + lib/string_helpers.c | 224 +- + lib/test_hexdump.c | 30 +- + lib/test_printf.c | 33 +- + lib/vsprintf.c | 1740 ++++---- + mm/Makefile | 2 +- + mm/filemap.c | 1 + + mm/memcontrol.c | 68 +- + mm/nommu.c | 18 + + mm/oom_kill.c | 23 - + {lib => mm}/show_mem.c | 8 + + mm/slab.h | 6 +- + mm/slab_common.c | 53 +- + mm/vmalloc.c | 21 + + mm/vmscan.c | 88 + + net/9p/client.c | 97 +- + net/9p/trans_fd.c | 12 +- + net/9p/trans_rdma.c | 
4 +- + net/9p/trans_virtio.c | 4 +- + net/9p/trans_xen.c | 2 +- + tools/testing/nvdimm/test/ndtest.c | 22 +- + 248 files changed, 84382 insertions(+), 2223 deletions(-) + create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md + create mode 100644 fs/bcachefs/Kconfig + create mode 100644 fs/bcachefs/Makefile + create mode 100644 fs/bcachefs/acl.c + create mode 100644 fs/bcachefs/acl.h + create mode 100644 fs/bcachefs/alloc_background.c + create mode 100644 fs/bcachefs/alloc_background.h + create mode 100644 fs/bcachefs/alloc_foreground.c + create mode 100644 fs/bcachefs/alloc_foreground.h + create mode 100644 fs/bcachefs/alloc_types.h + create mode 100644 fs/bcachefs/backpointers.c + create mode 100644 fs/bcachefs/backpointers.h + create mode 100644 fs/bcachefs/bcachefs.h + create mode 100644 fs/bcachefs/bcachefs_format.h + create mode 100644 fs/bcachefs/bcachefs_ioctl.h + create mode 100644 fs/bcachefs/bkey.c + create mode 100644 fs/bcachefs/bkey.h + create mode 100644 fs/bcachefs/bkey_buf.h + create mode 100644 fs/bcachefs/bkey_methods.c + create mode 100644 fs/bcachefs/bkey_methods.h + create mode 100644 fs/bcachefs/bkey_sort.c + create mode 100644 fs/bcachefs/bkey_sort.h + create mode 100644 fs/bcachefs/bset.c + create mode 100644 fs/bcachefs/bset.h + create mode 100644 fs/bcachefs/btree_cache.c + create mode 100644 fs/bcachefs/btree_cache.h + create mode 100644 fs/bcachefs/btree_gc.c + create mode 100644 fs/bcachefs/btree_gc.h + create mode 100644 fs/bcachefs/btree_io.c + create mode 100644 fs/bcachefs/btree_io.h + create mode 100644 fs/bcachefs/btree_iter.c + create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_key_cache.c + create mode 100644 fs/bcachefs/btree_key_cache.h + create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.h + create mode 100644 fs/bcachefs/btree_update_interior.c + create mode 100644 fs/bcachefs/btree_update_interior.h + create mode 100644 fs/bcachefs/btree_update_leaf.c + create mode 100644 fs/bcachefs/buckets.c + create mode 100644 fs/bcachefs/buckets.h + create mode 100644 fs/bcachefs/buckets_types.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h + create mode 100644 fs/bcachefs/chardev.c + create mode 100644 fs/bcachefs/chardev.h + create mode 100644 fs/bcachefs/checksum.c + create mode 100644 fs/bcachefs/checksum.h + create mode 100644 fs/bcachefs/clock.c + create mode 100644 fs/bcachefs/clock.h + create mode 100644 fs/bcachefs/clock_types.h + create mode 100644 fs/bcachefs/compress.c + create mode 100644 fs/bcachefs/compress.h + create mode 100644 fs/bcachefs/counters.c + create mode 100644 fs/bcachefs/counters.h + create mode 100644 fs/bcachefs/darray.h + create mode 100644 fs/bcachefs/data_update.c + create mode 100644 fs/bcachefs/data_update.h + create mode 100644 fs/bcachefs/debug.c + create mode 100644 fs/bcachefs/debug.h + create mode 100644 fs/bcachefs/dirent.c + create mode 100644 fs/bcachefs/dirent.h + create mode 100644 fs/bcachefs/disk_groups.c + create mode 100644 fs/bcachefs/disk_groups.h + create mode 100644 fs/bcachefs/ec.c + create mode 100644 fs/bcachefs/ec.h + create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/errcode.c + create mode 100644 fs/bcachefs/errcode.h + create mode 100644 fs/bcachefs/error.c + create mode 100644 fs/bcachefs/error.h + create 
mode 100644 fs/bcachefs/extent_update.c + create mode 100644 fs/bcachefs/extent_update.h + create mode 100644 fs/bcachefs/extents.c + create mode 100644 fs/bcachefs/extents.h + create mode 100644 fs/bcachefs/extents_types.h + create mode 100644 fs/bcachefs/eytzinger.h + create mode 100644 fs/bcachefs/fifo.h + create mode 100644 fs/bcachefs/fs-common.c + create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io.c + create mode 100644 fs/bcachefs/fs-io.h + create mode 100644 fs/bcachefs/fs-ioctl.c + create mode 100644 fs/bcachefs/fs-ioctl.h + create mode 100644 fs/bcachefs/fs.c + create mode 100644 fs/bcachefs/fs.h + create mode 100644 fs/bcachefs/fsck.c + create mode 100644 fs/bcachefs/fsck.h + create mode 100644 fs/bcachefs/inode.c + create mode 100644 fs/bcachefs/inode.h + create mode 100644 fs/bcachefs/io.c + create mode 100644 fs/bcachefs/io.h + create mode 100644 fs/bcachefs/io_types.h + create mode 100644 fs/bcachefs/journal.c + create mode 100644 fs/bcachefs/journal.h + create mode 100644 fs/bcachefs/journal_io.c + create mode 100644 fs/bcachefs/journal_io.h + create mode 100644 fs/bcachefs/journal_reclaim.c + create mode 100644 fs/bcachefs/journal_reclaim.h + create mode 100644 fs/bcachefs/journal_sb.c + create mode 100644 fs/bcachefs/journal_sb.h + create mode 100644 fs/bcachefs/journal_seq_blacklist.c + create mode 100644 fs/bcachefs/journal_seq_blacklist.h + create mode 100644 fs/bcachefs/journal_types.h + create mode 100644 fs/bcachefs/keylist.c + create mode 100644 fs/bcachefs/keylist.h + create mode 100644 fs/bcachefs/keylist_types.h + create mode 100644 fs/bcachefs/lru.c + create mode 100644 fs/bcachefs/lru.h + create mode 100644 fs/bcachefs/migrate.c + create mode 100644 fs/bcachefs/migrate.h + create mode 100644 fs/bcachefs/move.c + create mode 100644 fs/bcachefs/move.h + create mode 100644 fs/bcachefs/move_types.h + create mode 100644 fs/bcachefs/movinggc.c + create mode 100644 fs/bcachefs/movinggc.h + create mode 100644 fs/bcachefs/opts.c + create mode 100644 fs/bcachefs/opts.h + create mode 100644 fs/bcachefs/quota.c + create mode 100644 fs/bcachefs/quota.h + create mode 100644 fs/bcachefs/quota_types.h + create mode 100644 fs/bcachefs/rebalance.c + create mode 100644 fs/bcachefs/rebalance.h + create mode 100644 fs/bcachefs/rebalance_types.h + create mode 100644 fs/bcachefs/recovery.c + create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/reflink.c + create mode 100644 fs/bcachefs/reflink.h + create mode 100644 fs/bcachefs/replicas.c + create mode 100644 fs/bcachefs/replicas.h + create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/siphash.c + create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/str_hash.h + create mode 100644 fs/bcachefs/subvolume.c + create mode 100644 fs/bcachefs/subvolume.h + create mode 100644 fs/bcachefs/subvolume_types.h + create mode 100644 fs/bcachefs/super-io.c + create mode 100644 fs/bcachefs/super-io.h + create mode 100644 fs/bcachefs/super.c + create mode 100644 fs/bcachefs/super.h + create mode 100644 fs/bcachefs/super_types.h + create mode 100644 fs/bcachefs/sysfs.c + create mode 100644 fs/bcachefs/sysfs.h + create mode 100644 fs/bcachefs/tests.c + create mode 100644 fs/bcachefs/tests.h + create mode 100644 fs/bcachefs/trace.c + create mode 100644 fs/bcachefs/util.c + create mode 100644 fs/bcachefs/util.h + create mode 100644 fs/bcachefs/varint.c + create mode 100644 fs/bcachefs/varint.h + create mode 100644 fs/bcachefs/vstructs.h + create mode 
100644 fs/bcachefs/xattr.c + create mode 100644 fs/bcachefs/xattr.h + rename {drivers/md/bcache => include/linux}/closure.h (94%) + create mode 100644 include/linux/pretty-printers.h + create mode 100644 include/linux/printbuf.h + delete mode 100644 include/linux/seq_buf.h + create mode 100644 include/linux/six.h + create mode 100644 include/trace/events/bcachefs.h + create mode 100644 kernel/locking/six.c + rename {drivers/md/bcache => lib}/closure.c (88%) + create mode 100644 lib/pretty-printers.c + create mode 100644 lib/printbuf.c + delete mode 100644 lib/seq_buf.c + rename {lib => mm}/show_mem.c (83%) + +diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md +new file mode 100644 +index 000000000000..8af34357dd98 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/bug_report.md +@@ -0,0 +1,61 @@ ++--- ++name: Bug report ++about: Create a report to help us improve ++title: " [short commit id]" ++labels: bug ++assignees: YellowOnion ++ ++--- ++ ++**Please search for duplicates** ++ ++**Version** ++ ++Make sure you're using a reasonably new version. ++ ++Provide the commit hash from the kernel version (preferable) or tools, don't say "I'm using the latest master" as that will very quickly become out of date. ++ ++**Generic info** ++Provide the output of: ++``` ++bcachefs fs usage ++bcachefs show-super ++``` ++**Tools bugs** ++ ++* pull the latest version, compile it, do not strip the binary. ++* provide the exact commands you used to run. ++* run with gdb: `gdb -ex run --args ./bcacehfs ` ++ ++If you get an assert/segfault etc: ++* type `bt` in to and provide the output here. ++ ++If the tools lockup: ++* run `perf top -p $(pidof bcachefs)` and provide a screenshot. ++* press ctrl+c to interrupt the process and provide the output of `bt`. ++ ++**Kernel bugs** ++Compile the kernel with these flags: ++ ++``` ++CONFIG_PREEMPT=y ++CONFIG_BCACHEFS_DEBUG=y ++CONFIG_KALLSYMS=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_DEBUG_FS=y ++CONFIG_DYNAMIC_FTRACE=y ++CONFIG_FTRACE=y ++``` ++Provide the output of `dmesg` either in a paste-bin or as attachment, if less than 30~ lines just provide inline here. ++ ++ ++**Optional Advanced** ++ ++If lockup or performance issues: ++* run `perf record` and `perf record -e 'bcachefs:*' -o events.data` both during the window of issue and then ctrl+c. ++* run `perf archive` to dump symbols. ++* archive, compress and upload the files: `perf.data`, `events.data` and `perf.data.tar.bz2`. ++ ++Upload large files to a file storage provider: ++* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` ++*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump +diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst +index 5e89497ba314..4f4a35b3aadc 100644 +--- a/Documentation/core-api/printk-formats.rst ++++ b/Documentation/core-api/printk-formats.rst +@@ -625,6 +625,28 @@ Examples:: + %p4cc Y10 little-endian (0x20303159) + %p4cc NV12 big-endian (0xb231564e) + ++Calling a pretty printer function ++--------------------------------- ++ ++:: ++ ++ %pf(%p) pretty printer function taking one argument ++ %pf(%p,%p) pretty printer function taking two arguments ++ ++For calling generic pretty printers. A pretty printer is a function that takes ++as its first argument a pointer to a printbuf, and then zero or more additional ++pointer arguments. 
For example: ++ ++ void foo_to_text(struct printbuf *out, struct foo *foo) ++ { ++ pr_buf(out, "bar=%u baz=%u", foo->bar, foo->baz); ++ } ++ ++ printf("%pf(%p)", CALL_PP(foo_to_text, foo)); ++ ++Note that a pretty-printer may not sleep if called from printk(). If called from ++pr_buf() or sprintf() there are no such restrictions. ++ + Thanks + ====== + +diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c +index 0fbda89cd1bb..05654dbeb2c4 100644 +--- a/arch/powerpc/kernel/process.c ++++ b/arch/powerpc/kernel/process.c +@@ -37,7 +37,7 @@ + #include + #include + #include +-#include ++#include + + #include + #include +@@ -1396,32 +1396,30 @@ void show_user_instructions(struct pt_regs *regs) + { + unsigned long pc; + int n = NR_INSN_TO_PRINT; +- struct seq_buf s; + char buf[96]; /* enough for 8 times 9 + 2 chars */ ++ struct printbuf s = PRINTBUF_EXTERN(buf, sizeof(buf)); + + pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); + +- seq_buf_init(&s, buf, sizeof(buf)); +- + while (n) { + int i; + +- seq_buf_clear(&s); ++ printbuf_reset(&s); + + for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { + int instr; + + if (copy_from_user_nofault(&instr, (void __user *)pc, + sizeof(instr))) { +- seq_buf_printf(&s, "XXXXXXXX "); ++ prt_printf(&s, "XXXXXXXX "); + continue; + } +- seq_buf_printf(&s, regs->nip == pc ? "<%08x> " : "%08x ", instr); ++ prt_printf(&s, regs->nip == pc ? "<%08x> " : "%08x ", instr); + } + +- if (!seq_buf_has_overflowed(&s)) ++ if (printbuf_remaining(&s)) + pr_info("%s[%d]: code: %s\n", current->comm, +- current->pid, s.buffer); ++ current->pid, s.buf); + } + } + +diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c +index d96fd14bd7c9..b34de62e65ce 100644 +--- a/arch/powerpc/kernel/security.c ++++ b/arch/powerpc/kernel/security.c +@@ -10,7 +10,7 @@ + #include + #include + #include +-#include ++#include + #include + + #include +@@ -144,31 +144,28 @@ void __init setup_spectre_v2(void) + #ifdef CONFIG_PPC_BOOK3S_64 + ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) + { ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + bool thread_priv; + + thread_priv = security_ftr_enabled(SEC_FTR_L1D_THREAD_PRIV); + + if (rfi_flush) { +- struct seq_buf s; +- seq_buf_init(&s, buf, PAGE_SIZE - 1); + +- seq_buf_printf(&s, "Mitigation: RFI Flush"); ++ prt_printf(&s, "Mitigation: RFI Flush"); + if (thread_priv) +- seq_buf_printf(&s, ", L1D private per thread"); +- +- seq_buf_printf(&s, "\n"); +- +- return s.len; ++ prt_printf(&s, ", L1D private per thread"); ++ ++ prt_printf(&s, "\n"); ++ } else if (thread_priv) { ++ prt_printf(&s, "Vulnerable: L1D private per thread\n"); ++ } else if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && ++ !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)) { ++ prt_printf(&s, "Not affected\n"); ++ } else { ++ prt_printf(&s, "Vulnerable\n"); + } + +- if (thread_priv) +- return sprintf(buf, "Vulnerable: L1D private per thread\n"); +- +- if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && +- !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)) +- return sprintf(buf, "Not affected\n"); +- +- return sprintf(buf, "Vulnerable\n"); ++ return printbuf_written(&s); + } + + ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +@@ -179,70 +176,66 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b + + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) + { +- struct seq_buf s; +- +- seq_buf_init(&s, 
buf, PAGE_SIZE - 1); ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + + if (security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR)) { + if (barrier_nospec_enabled) +- seq_buf_printf(&s, "Mitigation: __user pointer sanitization"); ++ prt_printf(&s, "Mitigation: __user pointer sanitization"); + else +- seq_buf_printf(&s, "Vulnerable"); ++ prt_printf(&s, "Vulnerable"); + + if (security_ftr_enabled(SEC_FTR_SPEC_BAR_ORI31)) +- seq_buf_printf(&s, ", ori31 speculation barrier enabled"); ++ prt_printf(&s, ", ori31 speculation barrier enabled"); + +- seq_buf_printf(&s, "\n"); ++ prt_printf(&s, "\n"); + } else +- seq_buf_printf(&s, "Not affected\n"); ++ prt_printf(&s, "Not affected\n"); + +- return s.len; ++ return printbuf_written(&s); + } + + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) + { +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + bool bcs, ccd; + +- seq_buf_init(&s, buf, PAGE_SIZE - 1); +- + bcs = security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED); + ccd = security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED); + + if (bcs || ccd) { +- seq_buf_printf(&s, "Mitigation: "); ++ prt_printf(&s, "Mitigation: "); + + if (bcs) +- seq_buf_printf(&s, "Indirect branch serialisation (kernel only)"); ++ prt_printf(&s, "Indirect branch serialisation (kernel only)"); + + if (bcs && ccd) +- seq_buf_printf(&s, ", "); ++ prt_printf(&s, ", "); + + if (ccd) +- seq_buf_printf(&s, "Indirect branch cache disabled"); ++ prt_printf(&s, "Indirect branch cache disabled"); + + } else if (count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) { +- seq_buf_printf(&s, "Mitigation: Software count cache flush"); ++ prt_printf(&s, "Mitigation: Software count cache flush"); + + if (count_cache_flush_type == BRANCH_CACHE_FLUSH_HW) +- seq_buf_printf(&s, " (hardware accelerated)"); ++ prt_printf(&s, " (hardware accelerated)"); + + } else if (btb_flush_enabled) { +- seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); ++ prt_printf(&s, "Mitigation: Branch predictor state flush"); + } else { +- seq_buf_printf(&s, "Vulnerable"); ++ prt_printf(&s, "Vulnerable"); + } + + if (bcs || ccd || count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) { + if (link_stack_flush_type != BRANCH_CACHE_FLUSH_NONE) +- seq_buf_printf(&s, ", Software link stack flush"); ++ prt_printf(&s, ", Software link stack flush"); + if (link_stack_flush_type == BRANCH_CACHE_FLUSH_HW) +- seq_buf_printf(&s, " (hardware accelerated)"); ++ prt_printf(&s, " (hardware accelerated)"); + } + +- seq_buf_printf(&s, "\n"); ++ prt_printf(&s, "\n"); + +- return s.len; ++ return printbuf_written(&s); + } + + #ifdef CONFIG_PPC_BOOK3S_64 +diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c +index 82cae08976bc..fe2b41858b5f 100644 +--- a/arch/powerpc/platforms/pseries/papr_scm.c ++++ b/arch/powerpc/platforms/pseries/papr_scm.c +@@ -12,7 +12,7 @@ + #include + #include + #include +-#include ++#include + #include + + #include +@@ -1142,7 +1142,7 @@ static ssize_t perf_stats_show(struct device *dev, + { + int index; + ssize_t rc; +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + struct nvdimm *dimm = to_nvdimm(dev); +@@ -1165,18 +1165,17 @@ static ssize_t perf_stats_show(struct device *dev, + * values. Since stat_id is essentially a char string of + * 8 bytes, simply use the string format specifier to print it. 
+ */ +- seq_buf_init(&s, buf, PAGE_SIZE); + for (index = 0, stat = stats->scm_statistic; + index < be32_to_cpu(stats->num_statistics); + ++index, ++stat) { +- seq_buf_printf(&s, "%.8s = 0x%016llX\n", +- stat->stat_id, +- be64_to_cpu(stat->stat_val)); ++ prt_printf(&s, "%.8s = 0x%016llX\n", ++ stat->stat_id, ++ be64_to_cpu(stat->stat_val)); + } + + free_stats: + kfree(stats); +- return rc ? rc : (ssize_t)seq_buf_used(&s); ++ return rc ?: printbuf_written(&s); + } + static DEVICE_ATTR_ADMIN_RO(perf_stats); + +@@ -1185,7 +1184,7 @@ static ssize_t flags_show(struct device *dev, + { + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + u64 health; + int rc; + +@@ -1196,29 +1195,28 @@ static ssize_t flags_show(struct device *dev, + /* Copy health_bitmap locally, check masks & update out buffer */ + health = READ_ONCE(p->health_bitmap); + +- seq_buf_init(&s, buf, PAGE_SIZE); + if (health & PAPR_PMEM_UNARMED_MASK) +- seq_buf_printf(&s, "not_armed "); ++ prt_printf(&s, "not_armed "); + + if (health & PAPR_PMEM_BAD_SHUTDOWN_MASK) +- seq_buf_printf(&s, "flush_fail "); ++ prt_printf(&s, "flush_fail "); + + if (health & PAPR_PMEM_BAD_RESTORE_MASK) +- seq_buf_printf(&s, "restore_fail "); ++ prt_printf(&s, "restore_fail "); + + if (health & PAPR_PMEM_ENCRYPTED) +- seq_buf_printf(&s, "encrypted "); ++ prt_printf(&s, "encrypted "); + + if (health & PAPR_PMEM_SMART_EVENT_MASK) +- seq_buf_printf(&s, "smart_notify "); ++ prt_printf(&s, "smart_notify "); + + if (health & PAPR_PMEM_SCRUBBED_AND_LOCKED) +- seq_buf_printf(&s, "scrubbed locked "); ++ prt_printf(&s, "scrubbed locked "); + +- if (seq_buf_used(&s)) +- seq_buf_printf(&s, "\n"); ++ if (printbuf_written(&s)) ++ prt_printf(&s, "\n"); + +- return seq_buf_used(&s); ++ return printbuf_written(&s); + } + DEVICE_ATTR_RO(flags); + +diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +index f276aff521e8..50c12711a249 100644 +--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c ++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +@@ -19,7 +19,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -51,7 +51,7 @@ static struct kernfs_node *kn_mongrp; + /* Kernel fs node for "mon_data" directory under root */ + static struct kernfs_node *kn_mondata; + +-static struct seq_buf last_cmd_status; ++static struct printbuf last_cmd_status; + static char last_cmd_status_buf[512]; + + struct dentry *debugfs_resctrl; +@@ -59,13 +59,13 @@ struct dentry *debugfs_resctrl; + void rdt_last_cmd_clear(void) + { + lockdep_assert_held(&rdtgroup_mutex); +- seq_buf_clear(&last_cmd_status); ++ printbuf_reset(&last_cmd_status); + } + + void rdt_last_cmd_puts(const char *s) + { + lockdep_assert_held(&rdtgroup_mutex); +- seq_buf_puts(&last_cmd_status, s); ++ prt_str(&last_cmd_status, s); + } + + void rdt_last_cmd_printf(const char *fmt, ...) +@@ -74,7 +74,7 @@ void rdt_last_cmd_printf(const char *fmt, ...) 
+ + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); +- seq_buf_vprintf(&last_cmd_status, fmt, ap); ++ prt_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); + } + +@@ -833,7 +833,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + int len; + + mutex_lock(&rdtgroup_mutex); +- len = seq_buf_used(&last_cmd_status); ++ len = printbuf_written(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else +@@ -3248,8 +3248,8 @@ int __init rdtgroup_init(void) + { + int ret = 0; + +- seq_buf_init(&last_cmd_status, last_cmd_status_buf, +- sizeof(last_cmd_status_buf)); ++ last_cmd_status = PRINTBUF_EXTERN(last_cmd_status_buf, ++ sizeof(last_cmd_status_buf)); + + ret = rdtgroup_setup_root(); + if (ret) +diff --git a/block/bio.c b/block/bio.c +index 51c99f2c5c90..2d0d7f13d59a 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -582,15 +582,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) + } + EXPORT_SYMBOL(bio_kmalloc); + +-void zero_fill_bio(struct bio *bio) ++void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) + { + struct bio_vec bv; + struct bvec_iter iter; + +- bio_for_each_segment(bv, bio, iter) ++ __bio_for_each_segment(bv, bio, iter, start) + memzero_bvec(&bv); + } +-EXPORT_SYMBOL(zero_fill_bio); ++EXPORT_SYMBOL(zero_fill_bio_iter); + + /** + * bio_truncate - truncate the bio to small size of @new_size +@@ -1363,17 +1363,27 @@ EXPORT_SYMBOL(__bio_advance); + void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) + { ++ struct bio_vec src_bv, dst_bv; ++ void *src_p, *dst_p; ++ unsigned bytes; ++ + while (src_iter->bi_size && dst_iter->bi_size) { +- struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); +- struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); +- unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); +- void *src_buf = bvec_kmap_local(&src_bv); +- void *dst_buf = bvec_kmap_local(&dst_bv); ++ src_bv = bio_iter_iovec(src, *src_iter); ++ dst_bv = bio_iter_iovec(dst, *dst_iter); ++ ++ bytes = min(src_bv.bv_len, dst_bv.bv_len); ++ ++ src_p = kmap_atomic(src_bv.bv_page); ++ dst_p = kmap_atomic(dst_bv.bv_page); ++ ++ memcpy(dst_p + dst_bv.bv_offset, ++ src_p + src_bv.bv_offset, ++ bytes); + +- memcpy(dst_buf, src_buf, bytes); ++ kunmap_atomic(dst_p); ++ kunmap_atomic(src_p); + +- kunmap_local(dst_buf); +- kunmap_local(src_buf); ++ flush_dcache_page(dst_bv.bv_page); + + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); +@@ -1447,6 +1457,7 @@ void bio_set_pages_dirty(struct bio *bio) + set_page_dirty_lock(bvec->bv_page); + } + } ++EXPORT_SYMBOL_GPL(bio_set_pages_dirty); + + /* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
+@@ -1506,6 +1517,7 @@ void bio_check_pages_dirty(struct bio *bio) + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } ++EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + + static inline bool bio_remaining_done(struct bio *bio) + { +diff --git a/block/blk-core.c b/block/blk-core.c +index 27fb1357ad4b..7697abda9fad 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -207,6 +207,7 @@ const char *blk_status_to_str(blk_status_t status) + return ""; + return blk_errors[idx].name; + } ++EXPORT_SYMBOL_GPL(blk_status_to_str); + + /** + * blk_sync_queue - cancel any pending callbacks on a queue +diff --git a/block/blk.h b/block/blk.h +index 434017701403..066fd89c916b 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -240,7 +240,6 @@ static inline void blk_integrity_del(struct gendisk *disk) + + unsigned long blk_rq_timeout(unsigned long timeout); + void blk_add_timer(struct request *req); +-const char *blk_status_to_str(blk_status_t status); + + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs); +diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c +index 8bc71cdc2270..370993c9c381 100644 +--- a/drivers/acpi/apei/erst-dbg.c ++++ b/drivers/acpi/apei/erst-dbg.c +@@ -11,6 +11,7 @@ + * Author: Huang Ying + */ + ++#include + #include + #include + #include +diff --git a/drivers/block/loop.c b/drivers/block/loop.c +index 084f9b8a0ba3..7a420623ac38 100644 +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -1166,8 +1166,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) + if (!release) + blk_mq_unfreeze_queue(lo->lo_queue); + +- disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); +- + if (lo->lo_flags & LO_FLAGS_PARTSCAN) { + int err; + +diff --git a/drivers/clk/tegra/clk-bpmp.c b/drivers/clk/tegra/clk-bpmp.c +index 3748a39dae7c..7e3b48ed9d45 100644 +--- a/drivers/clk/tegra/clk-bpmp.c ++++ b/drivers/clk/tegra/clk-bpmp.c +@@ -5,7 +5,7 @@ + + #include + #include +-#include ++#include + #include + + #include +@@ -365,39 +365,38 @@ static void tegra_bpmp_clk_info_dump(struct tegra_bpmp *bpmp, + const struct tegra_bpmp_clk_info *info) + { + const char *prefix = ""; +- struct seq_buf buf; ++ struct printbuf buf = PRINTBUF; + unsigned int i; +- char flags[64]; +- +- seq_buf_init(&buf, flags, sizeof(flags)); + + if (info->flags) +- seq_buf_printf(&buf, "("); ++ prt_printf(&buf, "("); + + if (info->flags & TEGRA_BPMP_CLK_HAS_MUX) { +- seq_buf_printf(&buf, "%smux", prefix); ++ prt_printf(&buf, "%smux", prefix); + prefix = ", "; + } + + if ((info->flags & TEGRA_BPMP_CLK_HAS_SET_RATE) == 0) { +- seq_buf_printf(&buf, "%sfixed", prefix); ++ prt_printf(&buf, "%sfixed", prefix); + prefix = ", "; + } + + if (info->flags & TEGRA_BPMP_CLK_IS_ROOT) { +- seq_buf_printf(&buf, "%sroot", prefix); ++ prt_printf(&buf, "%sroot", prefix); + prefix = ", "; + } + + if (info->flags) +- seq_buf_printf(&buf, ")"); ++ prt_printf(&buf, ")"); + + dev_printk(level, bpmp->dev, "%03u: %s\n", info->id, info->name); +- dev_printk(level, bpmp->dev, " flags: %lx %s\n", info->flags, flags); ++ dev_printk(level, bpmp->dev, " flags: %lx %s\n", info->flags, printbuf_str(&buf)); + dev_printk(level, bpmp->dev, " parents: %u\n", info->num_parents); + + for (i = 0; i < info->num_parents; i++) + dev_printk(level, bpmp->dev, " %03u\n", info->parents[i]); ++ ++ printbuf_exit(&buf); + } + + static int tegra_bpmp_probe_clocks(struct tegra_bpmp *bpmp, +diff --git a/drivers/input/joystick/analog.c 
b/drivers/input/joystick/analog.c +index 3088c5b829f0..a8c5f90e8208 100644 +--- a/drivers/input/joystick/analog.c ++++ b/drivers/input/joystick/analog.c +@@ -19,7 +19,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + +@@ -339,24 +339,21 @@ static void analog_calibrate_timer(struct analog_port *port) + + static void analog_name(struct analog *analog) + { +- struct seq_buf s; ++ struct printbuf buf = PRINTBUF_EXTERN(analog->name, sizeof(analog->name)); + +- seq_buf_init(&s, analog->name, sizeof(analog->name)); +- seq_buf_printf(&s, "Analog %d-axis %d-button", +- hweight8(analog->mask & ANALOG_AXES_STD), +- hweight8(analog->mask & ANALOG_BTNS_STD) + !!(analog->mask & ANALOG_BTNS_CHF) * 2 + +- hweight16(analog->mask & ANALOG_BTNS_GAMEPAD) + !!(analog->mask & ANALOG_HBTN_CHF) * 4); ++ prt_printf(&buf, "Analog %d-axis %d-button", ++ hweight8(analog->mask & ANALOG_AXES_STD), ++ hweight8(analog->mask & ANALOG_BTNS_STD) + !!(analog->mask & ANALOG_BTNS_CHF) * 2 + ++ hweight16(analog->mask & ANALOG_BTNS_GAMEPAD) + !!(analog->mask & ANALOG_HBTN_CHF) * 4); + + if (analog->mask & ANALOG_HATS_ALL) +- seq_buf_printf(&s, " %d-hat", +- hweight16(analog->mask & ANALOG_HATS_ALL)); +- ++ prt_printf(&buf, " %d-hat", hweight16(analog->mask & ANALOG_HATS_ALL)); + if (analog->mask & ANALOG_HAT_FCS) +- seq_buf_printf(&s, " FCS"); ++ prt_printf(&buf, " FCS"); + if (analog->mask & ANALOG_ANY_CHF) +- seq_buf_printf(&s, (analog->mask & ANALOG_SAITEK) ? " Saitek" : " CHF"); ++ prt_printf(&buf, (analog->mask & ANALOG_SAITEK) ? " Saitek" : " CHF"); + +- seq_buf_printf(&s, (analog->mask & ANALOG_GAMEPAD) ? " gamepad" : " joystick"); ++ prt_printf(&buf, (analog->mask & ANALOG_GAMEPAD) ? " gamepad" : " joystick"); + } + + /* +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..f1a1f0c4a0ea 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -4,6 +4,7 @@ config BCACHE + tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS + select CRC64 ++ select CLOSURES + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. +@@ -19,15 +20,6 @@ config BCACHE_DEBUG + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +-config BCACHE_CLOSURES_DEBUG +- bool "Debug closures" +- depends on BCACHE +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. 
+- + config BCACHE_ASYNC_REGISTRATION + bool "Asynchronous device registration (EXPERIMENTAL)" + depends on BCACHE +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..054e8a33a7ab 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -2,6 +2,6 @@ + + obj-$(CONFIG_BCACHE) += bcache.o + +-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ +- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ ++bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ ++ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 2acda9cea0f9..bf96b3e6b6eb 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -179,6 +179,7 @@ + #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ + + #include ++#include + #include + #include + #include +@@ -192,7 +193,6 @@ + #include "bcache_ondisk.h" + #include "bset.h" + #include "util.h" +-#include "closure.h" + + struct bucket { + atomic_t pin; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 3563d15dbaf2..9249aba333bc 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2913,7 +2913,6 @@ static int __init bcache_init(void) + goto err; + + bch_debug_init(); +- closure_debug_init(); + + bcache_is_reboot = false; + +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index 6f3cb7c92130..f61ab1bada6c 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -4,6 +4,7 @@ + #define _BCACHE_UTIL_H + + #include ++#include + #include + #include + #include +@@ -13,8 +14,6 @@ + #include + #include + +-#include "closure.h" +- + struct closure; + + #ifdef CONFIG_BCACHE_DEBUG +diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c +index 462b429ad243..f06328035b9c 100644 +--- a/drivers/pci/p2pdma.c ++++ b/drivers/pci/p2pdma.c +@@ -17,7 +17,7 @@ + #include + #include + #include +-#include ++#include + #include + + enum pci_p2pdma_map_type { +@@ -281,12 +281,9 @@ static int pci_bridge_has_acs_redir(struct pci_dev *pdev) + return 0; + } + +-static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) ++static void prt_bus_devfn(struct printbuf *buf, struct pci_dev *pdev) + { +- if (!buf) +- return; +- +- seq_buf_printf(buf, "%s;", pci_name(pdev)); ++ prt_printf(buf, "%s;", pci_name(pdev)); + } + + static bool cpu_supports_p2pdma(void) +@@ -460,13 +457,11 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, + struct pci_dev *a = provider, *b = client, *bb; + bool acs_redirects = false; + struct pci_p2pdma *p2pdma; +- struct seq_buf acs_list; + int acs_cnt = 0; + int dist_a = 0; + int dist_b = 0; + char buf[128]; +- +- seq_buf_init(&acs_list, buf, sizeof(buf)); ++ struct printbuf acs_list = PRINTBUF_EXTERN(buf, sizeof(buf)); + + /* + * Note, we don't need to take references to devices returned by +@@ -477,7 +472,7 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, + dist_b = 0; + + if (pci_bridge_has_acs_redir(a)) { +- seq_buf_print_bus_devfn(&acs_list, a); ++ prt_bus_devfn(&acs_list, a); + acs_cnt++; + } + +@@ -506,7 +501,7 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, + break; + + if (pci_bridge_has_acs_redir(bb)) { +- seq_buf_print_bus_devfn(&acs_list, bb); ++ prt_bus_devfn(&acs_list, bb); + acs_cnt++; + } + +@@ -521,11 +516,11 @@ calc_map_type_and_dist(struct pci_dev *provider, 
struct pci_dev *client, + } + + if (verbose) { +- acs_list.buffer[acs_list.len-1] = 0; /* drop final semicolon */ ++ acs_list.buf[acs_list.pos-1] = 0; /* drop final semicolon */ + pci_warn(client, "ACS redirect is set between the client and provider (%s)\n", + pci_name(provider)); + pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n", +- acs_list.buffer); ++ acs_list.buf); + } + acs_redirects = true; + +diff --git a/fs/Kconfig b/fs/Kconfig +index 5976eb33535f..6d2c4231494a 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" + source "fs/btrfs/Kconfig" + source "fs/nilfs2/Kconfig" + source "fs/f2fs/Kconfig" ++source "fs/bcachefs/Kconfig" + source "fs/zonefs/Kconfig" + + endif # BLOCK +diff --git a/fs/Makefile b/fs/Makefile +index 208a74e0b00e..5d5c8c792058 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -134,6 +134,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_F2FS_FS) += f2fs/ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +new file mode 100644 +index 000000000000..008886967841 +--- /dev/null ++++ b/fs/bcachefs/Kconfig +@@ -0,0 +1,59 @@ ++ ++config BCACHEFS_FS ++ tristate "bcachefs filesystem support" ++ depends on BLOCK ++ select EXPORTFS ++ select CLOSURES ++ select LIBCRC32C ++ select CRC64 ++ select FS_POSIX_ACL ++ select LZ4_COMPRESS ++ select LZ4_DECOMPRESS ++ select ZLIB_DEFLATE ++ select ZLIB_INFLATE ++ select ZSTD_COMPRESS ++ select ZSTD_DECOMPRESS ++ select CRYPTO_SHA256 ++ select CRYPTO_CHACHA20 ++ select CRYPTO_POLY1305 ++ select KEYS ++ select SIXLOCKS ++ select RAID6_PQ ++ select XOR_BLOCKS ++ select XXHASH ++ select SRCU ++ select SYMBOLIC_ERRNAME ++ help ++ The bcachefs filesystem - a modern, copy on write filesystem, with ++ support for multiple devices, compression, checksumming, etc. ++ ++config BCACHEFS_QUOTA ++ bool "bcachefs quota support" ++ depends on BCACHEFS_FS ++ select QUOTACTL ++ ++config BCACHEFS_POSIX_ACL ++ bool "bcachefs POSIX ACL support" ++ depends on BCACHEFS_FS ++ select FS_POSIX_ACL ++ ++config BCACHEFS_DEBUG ++ bool "bcachefs debugging" ++ depends on BCACHEFS_FS ++ help ++ Enables many extra debugging checks and assertions. ++ ++ The resulting code will be significantly slower than normal; you ++ probably shouldn't select this option unless you're a developer. 
++ ++config BCACHEFS_TESTS ++ bool "bcachefs unit and performance tests" ++ depends on BCACHEFS_FS ++ help ++ Include some unit and performance tests for the core btree code ++ ++config BCACHEFS_LOCK_TIME_STATS ++ bool "bcachefs lock time statistics" ++ depends on BCACHEFS_FS ++ help ++ Expose statistics for how long we held a lock in debugfs +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +new file mode 100644 +index 000000000000..5dad8ed03a20 +--- /dev/null ++++ b/fs/bcachefs/Makefile +@@ -0,0 +1,69 @@ ++ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o ++ ++bcachefs-y := \ ++ alloc_background.o \ ++ alloc_foreground.o \ ++ backpointers.o \ ++ bkey.o \ ++ bkey_methods.o \ ++ bkey_sort.o \ ++ bset.o \ ++ btree_cache.o \ ++ btree_gc.o \ ++ btree_io.o \ ++ btree_iter.o \ ++ btree_key_cache.o \ ++ btree_update_interior.o \ ++ btree_update_leaf.o \ ++ buckets.o \ ++ buckets_waiting_for_journal.o \ ++ chardev.o \ ++ checksum.o \ ++ clock.o \ ++ compress.o \ ++ counters.o \ ++ debug.o \ ++ dirent.o \ ++ disk_groups.o \ ++ data_update.o \ ++ ec.o \ ++ errcode.o \ ++ error.o \ ++ extents.o \ ++ extent_update.o \ ++ fs.o \ ++ fs-common.o \ ++ fs-ioctl.o \ ++ fs-io.o \ ++ fsck.o \ ++ inode.o \ ++ io.o \ ++ journal.o \ ++ journal_io.o \ ++ journal_reclaim.o \ ++ journal_sb.o \ ++ journal_seq_blacklist.o \ ++ keylist.o \ ++ lru.o \ ++ migrate.o \ ++ move.o \ ++ movinggc.o \ ++ opts.o \ ++ quota.o \ ++ rebalance.o \ ++ recovery.o \ ++ reflink.o \ ++ replicas.o \ ++ siphash.o \ ++ subvolume.o \ ++ super.o \ ++ super-io.o \ ++ sysfs.o \ ++ tests.o \ ++ trace.o \ ++ util.o \ ++ varint.o \ ++ xattr.o ++ ++bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +new file mode 100644 +index 000000000000..5c6ccf685094 +--- /dev/null ++++ b/fs/bcachefs/acl.c +@@ -0,0 +1,406 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#include "bcachefs.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "acl.h" ++#include "fs.h" ++#include "xattr.h" ++ ++static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) ++{ ++ return sizeof(bch_acl_header) + ++ sizeof(bch_acl_entry_short) * nr_short + ++ sizeof(bch_acl_entry) * nr_long; ++} ++ ++static inline int acl_to_xattr_type(int type) ++{ ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; ++ case ACL_TYPE_DEFAULT: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Convert from filesystem to in-memory representation. 
++ */ ++static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) ++{ ++ const void *p, *end = value + size; ++ struct posix_acl *acl; ++ struct posix_acl_entry *out; ++ unsigned count = 0; ++ ++ if (!value) ++ return NULL; ++ if (size < sizeof(bch_acl_header)) ++ goto invalid; ++ if (((bch_acl_header *)value)->a_version != ++ cpu_to_le32(BCH_ACL_VERSION)) ++ goto invalid; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *entry = p; ++ ++ if (p + sizeof(bch_acl_entry_short) > end) ++ goto invalid; ++ ++ switch (le16_to_cpu(entry->e_tag)) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ case ACL_GROUP: ++ p += sizeof(bch_acl_entry); ++ break; ++ default: ++ goto invalid; ++ } ++ ++ count++; ++ } ++ ++ if (p > end) ++ goto invalid; ++ ++ if (!count) ++ return NULL; ++ ++ acl = posix_acl_alloc(count, GFP_KERNEL); ++ if (!acl) ++ return ERR_PTR(-ENOMEM); ++ ++ out = acl->a_entries; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *in = p; ++ ++ out->e_tag = le16_to_cpu(in->e_tag); ++ out->e_perm = le16_to_cpu(in->e_perm); ++ ++ switch (out->e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ out->e_uid = make_kuid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ out->e_gid = make_kgid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ } ++ ++ out++; ++ } ++ ++ BUG_ON(out != acl->a_entries + acl->a_count); ++ ++ return acl; ++invalid: ++ pr_err("invalid acl entry"); ++ return ERR_PTR(-EINVAL); ++} ++ ++#define acl_for_each_entry(acl, acl_e) \ ++ for (acl_e = acl->a_entries; \ ++ acl_e < acl->a_entries + acl->a_count; \ ++ acl_e++) ++ ++/* ++ * Convert from in-memory to filesystem representation. 
++ */ ++static struct bkey_i_xattr * ++bch2_acl_to_xattr(struct btree_trans *trans, ++ const struct posix_acl *acl, ++ int type) ++{ ++ struct bkey_i_xattr *xattr; ++ bch_acl_header *acl_header; ++ const struct posix_acl_entry *acl_e; ++ void *outptr; ++ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; ++ ++ acl_for_each_entry(acl, acl_e) { ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ nr_long++; ++ break; ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ nr_short++; ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ ++ acl_len = bch2_acl_size(nr_short, nr_long); ++ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); ++ ++ if (u64s > U8_MAX) ++ return ERR_PTR(-E2BIG); ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return xattr; ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = acl_to_xattr_type(type); ++ xattr->v.x_name_len = 0, ++ xattr->v.x_val_len = cpu_to_le16(acl_len); ++ ++ acl_header = xattr_val(&xattr->v); ++ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); ++ ++ outptr = (void *) acl_header + sizeof(*acl_header); ++ ++ acl_for_each_entry(acl, acl_e) { ++ bch_acl_entry *entry = outptr; ++ ++ entry->e_tag = cpu_to_le16(acl_e->e_tag); ++ entry->e_perm = cpu_to_le16(acl_e->e_perm); ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ entry->e_id = cpu_to_le32( ++ from_kuid(&init_user_ns, acl_e->e_uid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32( ++ from_kgid(&init_user_ns, acl_e->e_gid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ outptr += sizeof(bch_acl_entry_short); ++ break; ++ } ++ } ++ ++ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); ++ ++ return xattr; ++} ++ ++struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c_xattr xattr; ++ struct posix_acl *acl = NULL; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (rcu) ++ return ERR_PTR(-ECHILD); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, ++ &hash, inode_inum(inode), ++ &X_SEARCH(acl_to_xattr_type(type), "", 0), ++ 0); ++ if (ret) { ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (ret != -ENOENT) ++ acl = ERR_PTR(ret); ++ goto out; ++ } ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ acl = ERR_PTR(ret); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(k); ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ++ if (!IS_ERR(acl)) ++ set_cached_acl(&inode->v, type, acl); ++out: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return acl; ++} ++ ++int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode_u, ++ struct posix_acl *acl, int type) ++{ ++ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); ++ int ret; ++ ++ if (type == ACL_TYPE_DEFAULT && ++ !S_ISDIR(inode_u->bi_mode)) ++ return acl ? 
-EACCES : 0; ++ ++ if (acl) { ++ struct bkey_i_xattr *xattr = ++ bch2_acl_to_xattr(trans, acl, type); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, ++ inum, &xattr->k_i, 0); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(acl_to_xattr_type(type), "", 0); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, ++ inum, &search); ++ } ++ ++ return ret == -ENOENT ? 0 : ret; ++} ++ ++int bch2_set_acl(struct user_namespace *mnt_userns, ++ struct inode *vinode, struct posix_acl *_acl, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl; ++ umode_t mode; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ acl = _acl; ++ ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto btree_err; ++ ++ mode = inode_u.bi_mode; ++ ++ if (type == ACL_TYPE_ACCESS) { ++ ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); ++ if (ret) ++ goto btree_err; ++ ++ inode_u.bi_ctime = bch2_current_time(c); ++ inode_u.bi_mode = mode; ++ ++ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, 0); ++btree_err: ++ bch2_trans_iter_exit(&trans, &inode_iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ++ ATTR_CTIME|ATTR_MODE); ++ ++ set_cached_acl(&inode->v, type, acl); ++err: ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); ++ struct btree_iter iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_i_xattr *new; ++ struct posix_acl *acl; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, ++ &hash_info, inum, ++ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret == -ENOENT ? 
0 : ret; ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ xattr = bkey_s_c_to_xattr(k); ++ if (ret) ++ goto err; ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ret = PTR_ERR_OR_ZERO(acl); ++ if (IS_ERR_OR_NULL(acl)) ++ goto err; ++ ++ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); ++ if (ret) ++ goto err; ++ ++ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ++ if (IS_ERR(new)) { ++ ret = PTR_ERR(new); ++ goto err; ++ } ++ ++ new->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); ++ *new_acl = acl; ++ acl = NULL; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (!IS_ERR_OR_NULL(acl)) ++ kfree(acl); ++ return ret; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +new file mode 100644 +index 000000000000..2d76a4897ba8 +--- /dev/null ++++ b/fs/bcachefs/acl.h +@@ -0,0 +1,58 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ACL_H ++#define _BCACHEFS_ACL_H ++ ++struct bch_inode_unpacked; ++struct bch_hash_info; ++struct bch_inode_info; ++struct posix_acl; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#define BCH_ACL_VERSION 0x0001 ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++ __le32 e_id; ++} bch_acl_entry; ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++} bch_acl_entry_short; ++ ++typedef struct { ++ __le32 a_version; ++} bch_acl_header; ++ ++struct posix_acl *bch2_get_acl(struct inode *, int, bool); ++ ++int bch2_set_acl_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ struct posix_acl *, int); ++int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); ++int bch2_acl_chmod(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ umode_t, struct posix_acl **); ++ ++#else ++ ++static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode_u, ++ struct posix_acl *acl, int type) ++{ ++ return 0; ++} ++ ++static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ ++ ++#endif /* _BCACHEFS_ACL_H */ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +new file mode 100644 +index 000000000000..cd6cbd2064ee +--- /dev/null ++++ b/fs/bcachefs/alloc_background.c +@@ -0,0 +1,1552 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "backpointers.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "buckets_waiting_for_journal.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "lru.h" ++#include "recovery.h" ++#include "varint.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Persistent alloc info: */ ++ ++static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++}; ++ ++struct bkey_alloc_unpacked { ++ u64 journal_seq; ++ u8 gen; ++ u8 oldest_gen; ++ u8 data_type; ++ bool need_discard:1; ++ bool need_inc_gen:1; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++}; ++ ++static inline 
u64 alloc_field_v1_get(const struct bch_alloc *a, ++ const void **p, unsigned field) ++{ ++ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; ++ u64 v; ++ ++ if (!(a->fields & (1 << field))) ++ return 0; ++ ++ switch (bytes) { ++ case 1: ++ v = *((const u8 *) *p); ++ break; ++ case 2: ++ v = le16_to_cpup(*p); ++ break; ++ case 4: ++ v = le32_to_cpup(*p); ++ break; ++ case 8: ++ v = le64_to_cpup(*p); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++ return v; ++} ++ ++static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) ++{ ++ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; ++ ++ if (!v) ++ return; ++ ++ a->v.fields |= 1 << field; ++ ++ switch (bytes) { ++ case 1: ++ *((u8 *) *p) = v; ++ break; ++ case 2: ++ *((__le16 *) *p) = cpu_to_le16(v); ++ break; ++ case 4: ++ *((__le32 *) *p) = cpu_to_le32(v); ++ break; ++ case 8: ++ *((__le64 *) *p) = cpu_to_le64(v); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++} ++ ++static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; ++ const void *d = in->data; ++ unsigned idx = 0; ++ ++ out->gen = in->gen; ++ ++#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++} ++ ++static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); ++ const u8 *in = a.v->data; ++ const u8 *end = bkey_val_end(a); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v; ++ ++ out->gen = a.v->gen; ++ out->oldest_gen = a.v->oldest_gen; ++ out->data_type = a.v->data_type; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < a.v->nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v = 0; \ ++ } \ ++ out->_name = v; \ ++ if (v != out->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++ return 0; ++} ++ ++static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); ++ const u8 *in = a.v->data; ++ const u8 *end = bkey_val_end(a); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v; ++ ++ out->gen = a.v->gen; ++ out->oldest_gen = a.v->oldest_gen; ++ out->data_type = a.v->data_type; ++ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); ++ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); ++ out->journal_seq = le64_to_cpu(a.v->journal_seq); ++ ++#define x(_name, _bits) \ ++ if (fieldnr < a.v->nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v = 0; \ ++ } \ ++ out->_name = v; \ ++ if (v != out->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++ return 0; ++} ++ ++static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ bch2_alloc_unpack_v1(&ret, k); ++ break; ++ case KEY_TYPE_alloc_v2: ++ bch2_alloc_unpack_v2(&ret, k); ++ break; ++ case KEY_TYPE_alloc_v3: ++ bch2_alloc_unpack_v3(&ret, k); ++ break; ++ } ++ ++ return ret; ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ 
BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; ++} ++ ++static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) ++{ ++ unsigned i, bytes = offsetof(struct bch_alloc, data); ++ ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) ++ if (a->fields & (1 << i)) ++ bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; ++ ++ return DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ ++ /* allow for unknown fields */ ++ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { ++ prt_printf(err, "incorrect value size (%zu < %u)", ++ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_alloc_unpacked u; ++ ++ if (bch2_alloc_unpack_v2(&u, k)) { ++ prt_printf(err, "unpack error"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_alloc_unpacked u; ++ ++ if (bch2_alloc_unpack_v3(&u, k)) { ++ prt_printf(err, "unpack error"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); ++ ++ if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { ++ prt_printf(err, "bad val size (%lu != %u)", ++ bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); ++ return -EINVAL; ++ } ++ ++ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && ++ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { ++ prt_printf(err, "invalid backpointers_start"); ++ return -EINVAL; ++ } ++ ++ if (rw == WRITE) { ++ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { ++ prt_printf(err, "invalid data type (got %u should be %u)", ++ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); ++ return -EINVAL; ++ } ++ ++ switch (a.v->data_type) { ++ case BCH_DATA_free: ++ case BCH_DATA_need_gc_gens: ++ case BCH_DATA_need_discard: ++ if (a.v->dirty_sectors || ++ a.v->cached_sectors || ++ a.v->stripe) { ++ prt_printf(err, "empty data type free but have data"); ++ return -EINVAL; ++ } ++ break; ++ case BCH_DATA_sb: ++ case BCH_DATA_journal: ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ if (!a.v->dirty_sectors) { ++ prt_printf(err, "data_type %s but dirty_sectors==0", ++ bch2_data_types[a.v->data_type]); ++ return -EINVAL; ++ } ++ break; ++ case BCH_DATA_cached: ++ if (!a.v->cached_sectors || ++ a.v->dirty_sectors || ++ a.v->stripe) { ++ prt_printf(err, "data type inconsistency"); ++ return -EINVAL; ++ } ++ ++ if (!a.v->io_time[READ] && ++ test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { ++ prt_printf(err, "cached bucket with read_time == 0"); ++ return -EINVAL; ++ } ++ break; ++ case BCH_DATA_stripe: ++ if (!a.v->stripe) { ++ prt_printf(err, "data_type %s but stripe==0", ++ bch2_data_types[a.v->data_type]); ++ return -EINVAL; ++ } ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++static inline u64 swab40(u64 x) ++{ ++ return (((x & 0x00000000ffULL) << 32)| ++ ((x & 0x000000ff00ULL) << 16)| ++ 
((x & 0x0000ff0000ULL) >> 0)| ++ ((x & 0x00ff000000ULL) >> 16)| ++ ((x & 0xff00000000ULL) >> 32)); ++} ++ ++void bch2_alloc_v4_swab(struct bkey_s k) ++{ ++ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; ++ struct bch_backpointer *bp, *bps; ++ ++ a->journal_seq = swab64(a->journal_seq); ++ a->flags = swab32(a->flags); ++ a->dirty_sectors = swab32(a->dirty_sectors); ++ a->cached_sectors = swab32(a->cached_sectors); ++ a->io_time[0] = swab64(a->io_time[0]); ++ a->io_time[1] = swab64(a->io_time[1]); ++ a->stripe = swab32(a->stripe); ++ a->nr_external_backpointers = swab32(a->nr_external_backpointers); ++ ++ bps = alloc_v4_backpointers(a); ++ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { ++ bp->bucket_offset = swab40(bp->bucket_offset); ++ bp->bucket_len = swab32(bp->bucket_len); ++ bch2_bpos_swab(&bp->pos); ++ } ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_alloc_v4 _a; ++ const struct bch_alloc_v4 *a = &_a; ++ const struct bch_backpointer *bps; ++ unsigned i; ++ ++ if (k.k->type == KEY_TYPE_alloc_v4) ++ a = bkey_s_c_to_alloc_v4(k).v; ++ else ++ bch2_alloc_to_v4(k, &_a); ++ ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "gen %u oldest_gen %u data_type %s", ++ a->gen, a->oldest_gen, bch2_data_types[a->data_type]); ++ prt_newline(out); ++ prt_printf(out, "journal_seq %llu", a->journal_seq); ++ prt_newline(out); ++ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); ++ prt_newline(out); ++ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); ++ prt_newline(out); ++ prt_printf(out, "dirty_sectors %u", a->dirty_sectors); ++ prt_newline(out); ++ prt_printf(out, "cached_sectors %u", a->cached_sectors); ++ prt_newline(out); ++ prt_printf(out, "stripe %u", a->stripe); ++ prt_newline(out); ++ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); ++ prt_newline(out); ++ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); ++ prt_newline(out); ++ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); ++ prt_newline(out); ++ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); ++ printbuf_indent_add(out, 2); ++ ++ bps = alloc_v4_backpointers_c(a); ++ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) { ++ prt_newline(out); ++ bch2_backpointer_to_text(out, &bps[i]); ++ } ++ ++ printbuf_indent_sub(out, 4); ++} ++ ++void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) ++{ ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ int d; ++ ++ *out = *bkey_s_c_to_alloc_v4(k).v; ++ ++ d = (int) BCH_ALLOC_V4_U64s - ++ (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0); ++ if (unlikely(d > 0)) { ++ memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out), ++ 0, ++ d * sizeof(u64)); ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); ++ } ++ } else { ++ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ ++ *out = (struct bch_alloc_v4) { ++ .journal_seq = u.journal_seq, ++ .flags = u.need_discard, ++ .gen = u.gen, ++ .oldest_gen = u.oldest_gen, ++ .data_type = u.data_type, ++ .stripe_redundancy = u.stripe_redundancy, ++ .dirty_sectors = u.dirty_sectors, ++ .cached_sectors = u.cached_sectors, ++ .io_time[READ] = u.read_time, ++ .io_time[WRITE] = u.write_time, ++ .stripe = u.stripe, ++ }; ++ ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); ++ } ++} ++ ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ unsigned bytes = k.k->type == 
KEY_TYPE_alloc_v4 ++ ? bkey_bytes(k.k) ++ : sizeof(struct bkey_i_alloc_v4); ++ struct bkey_i_alloc_v4 *ret; ++ ++ /* ++ * Reserve space for one more backpointer here: ++ * Not sketchy at doing it this way, nope... ++ */ ++ ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); ++ if (IS_ERR(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ bkey_reassemble(&ret->k_i, k); ++ ++ if (BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v) < BCH_ALLOC_V4_U64s) { ++ struct bch_backpointer *src, *dst; ++ ++ src = alloc_v4_backpointers(&ret->v); ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); ++ dst = alloc_v4_backpointers(&ret->v); ++ ++ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * ++ sizeof(struct bch_backpointer)); ++ memset(src, 0, dst - src); ++ set_alloc_v4_u64s(ret); ++ } ++ } else { ++ bkey_alloc_v4_init(&ret->k_i); ++ ret->k.p = k.k->p; ++ bch2_alloc_to_v4(k, &ret->v); ++ } ++ return ret; ++} ++ ++int bch2_alloc_read(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bch_dev *ca; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ /* ++ * Not a fsck error because this is checked/repaired by ++ * bch2_check_alloc_key() which runs later: ++ */ ++ if (!bch2_dev_bucket_exists(c, k.k->p)) ++ continue; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ bch2_alloc_to_v4(k, &a); ++ ++ *bucket_gen(ca, k.k->p.offset) = a.gen; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++/* Free space/discard btree: */ ++ ++static int bch2_bucket_do_index(struct btree_trans *trans, ++ struct bkey_s_c alloc_k, ++ const struct bch_alloc_v4 *a, ++ bool set) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ struct bkey_i *k; ++ enum btree_id btree; ++ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (a->data_type != BCH_DATA_free && ++ a->data_type != BCH_DATA_need_discard) ++ return 0; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.type = new_type; ++ ++ switch (a->data_type) { ++ case BCH_DATA_free: ++ btree = BTREE_ID_freespace; ++ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); ++ bch2_key_resize(&k->k, 1); ++ break; ++ case BCH_DATA_need_discard: ++ btree = BTREE_ID_need_discard; ++ k->k.p = alloc_k.k->p; ++ break; ++ default: ++ return 0; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, btree, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ old = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(old); ++ if (ret) ++ goto err; ++ ++ if (ca->mi.freespace_initialized && ++ bch2_trans_inconsistent_on(old.k->type != old_type, trans, ++ "incorrect key when %s %s btree (got %s should be %s)\n" ++ " for %s", ++ set ? 
"setting" : "clearing", ++ bch2_btree_ids[btree], ++ bch2_bkey_types[old.k->type], ++ bch2_bkey_types[old_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, k, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_alloc(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 old_a, *new_a; ++ u64 old_lru, new_lru; ++ int ret = 0; ++ ++ /* ++ * Deletion only happens in the device removal path, with ++ * BTREE_TRIGGER_NORUN: ++ */ ++ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ new_a = &bkey_i_to_alloc_v4(new)->v; ++ ++ new_a->data_type = alloc_data_type(*new_a, new_a->data_type); ++ ++ if (new_a->dirty_sectors > old_a.dirty_sectors || ++ new_a->cached_sectors > old_a.cached_sectors) { ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); ++ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); ++ } ++ ++ if (data_type_is_empty(new_a->data_type) && ++ BCH_ALLOC_V4_NEED_INC_GEN(new_a) && ++ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { ++ new_a->gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); ++ } ++ ++ if (old_a.data_type != new_a->data_type || ++ (new_a->data_type == BCH_DATA_free && ++ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { ++ ret = bch2_bucket_do_index(trans, old, &old_a, false) ?: ++ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); ++ if (ret) ++ return ret; ++ } ++ ++ if (new_a->data_type == BCH_DATA_cached && ++ !new_a->io_time[READ]) ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ ++ old_lru = alloc_lru_idx(old_a); ++ new_lru = alloc_lru_idx(*new_a); ++ ++ if (old_lru != new_lru) { ++ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, ++ old_lru, &new_lru, old); ++ if (ret) ++ return ret; ++ ++ if (new_a->data_type == BCH_DATA_cached) ++ new_a->io_time[READ] = new_lru; ++ } ++ ++ return 0; ++} ++ ++static int bch2_check_alloc_key(struct btree_trans *trans, ++ struct btree_iter *alloc_iter, ++ struct btree_iter *discard_iter, ++ struct btree_iter *freespace_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca; ++ struct bch_alloc_v4 a; ++ unsigned discard_key_type, freespace_key_type; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos) ++ ? bch2_btree_iter_peek_slot(alloc_iter) ++ : bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 1; ++ ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, ++ "alloc key for invalid device:bucket %llu:%llu", ++ alloc_k.k->p.inode, alloc_k.k->p.offset)) ++ return bch2_btree_delete_at(trans, alloc_iter, 0); ++ ++ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ if (!ca->mi.freespace_initialized) ++ return 0; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ discard_key_type = a.data_type == BCH_DATA_need_discard ++ ? KEY_TYPE_set : 0; ++ freespace_key_type = a.data_type == BCH_DATA_free ++ ? 
KEY_TYPE_set : 0; ++ ++ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); ++ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a)); ++ ++ k = bch2_btree_iter_peek_slot(discard_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != discard_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.type = discard_key_type; ++ update->k.p = discard_iter->pos; ++ ++ ret = bch2_trans_update(trans, discard_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++ ++ k = bch2_btree_iter_peek_slot(freespace_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != freespace_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.type = freespace_key_type; ++ update->k.p = freespace_iter->pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, freespace_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_check_discard_freespace_key(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter; ++ struct bkey_s_c alloc_k; ++ struct bch_alloc_v4 a; ++ u64 genbits; ++ struct bpos pos; ++ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ++ ? BCH_DATA_need_discard ++ : BCH_DATA_free; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ pos = iter->pos; ++ pos.offset &= ~(~0ULL << 56); ++ genbits = iter->pos.offset & (~0ULL << 56); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, ++ "entry in %s btree for nonexistant dev:bucket %llu:%llu", ++ bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) ++ goto delete; ++ ++ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ goto err; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ if (fsck_err_on(a.data_type != state || ++ (state == BCH_DATA_free && ++ genbits != alloc_freespace_genbits(a)), c, ++ "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ bch2_btree_ids[iter->btree_id], ++ a.data_type == state, ++ genbits >> 56, alloc_freespace_genbits(a) >> 56)) ++ goto delete; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++delete: ++ ret = bch2_btree_delete_extent_at(trans, iter, ++ iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0); ++ goto out; ++} ++ ++int bch2_check_alloc_info(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter, discard_iter, freespace_iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ while (1) { ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_check_alloc_key(&trans, &iter, ++ &discard_iter, ++ &freespace_iter)); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_advance(&iter); ++ } ++ bch2_trans_iter_exit(&trans, &freespace_iter); ++ bch2_trans_iter_exit(&trans, &discard_iter); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret < 0) ++ goto err; ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)) ?: ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)); ++err: ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} ++ ++static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, ++ struct btree_iter *alloc_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter lru_iter; ++ struct bch_alloc_v4 a; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret; ++ ++ alloc_k = bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 0; ++ ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ if (a.data_type != BCH_DATA_cached) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(alloc_k.k->p.inode, a.io_time[READ]), 0); ++ ++ k = bch2_btree_iter_peek_slot(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!a.io_time[READ], c, ++ "cached bucket with read_time 0\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || ++ fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.io_time[READ]; ++ ++ if (!a.io_time[READ]) ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_set(trans, ++ alloc_k.k->p.inode, ++ alloc_k.k->p.offset, ++ &a.io_time[READ]); ++ if (ret) ++ goto err; ++ ++ if (a.io_time[READ] != read_time) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ goto err; ++ ++ a_mut->v.io_time[READ] = a.io_time[READ]; ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; ++ } ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &lru_iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_alloc_to_lru_refs(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ 
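The discard/freespace consistency checks above rely on how a freespace-btree key encodes both a bucket and its generation: the low 56 bits of the key's offset carry the bucket number, and the top 8 bits carry the bucket's generation gap divided by 16 (alloc_freespace_genbits(), declared further down in alloc_background.h). A minimal illustration of that split, written as a comment rather than code from the patch:

	/*
	 * Assumed layout of a freespace key's offset, matching the masks used in
	 * bch2_check_discard_freespace_key() above and alloc_freespace_pos() below:
	 *
	 *	bucket  = offset & ~(~0ULL << 56)	(low 56 bits)
	 *	genbits = offset &  (~0ULL << 56)	(top 8 bits)
	 *	genbits = ((u64) (u8) (gen - oldest_gen) >> 4) << 56
	 *
	 * Once the bucket's generation moves into a different group of 16, the
	 * stored genbits stop matching alloc_freespace_genbits(), which is the
	 * stale-entry case the check above deletes.
	 */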
int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_alloc_to_lru_ref(&trans, &iter)); ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} ++ ++static int bch2_discard_one_bucket(struct btree_trans *trans, ++ struct btree_iter *need_discard_iter, ++ struct bpos *discard_pos_done, ++ u64 *seen, ++ u64 *open, ++ u64 *need_journal_commit, ++ u64 *discarded) ++{ ++ struct bch_fs *c = trans->c; ++ struct bpos pos = need_discard_iter->pos; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct bkey_i_alloc_v4 *a; ++ struct printbuf buf = PRINTBUF; ++ bool did_discard = false; ++ int ret = 0; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); ++ return 0; ++ } ++ ++ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { ++ (*open)++; ++ goto out; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ pos.inode, pos.offset)) { ++ (*need_journal_commit)++; ++ goto out; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ need_discard_iter->pos, ++ BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { ++ a->v.gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ goto write; ++ } ++ ++ if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans, ++ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" ++ "%s", ++ a->v.journal_seq, ++ c->journal.flushed_seq_ondisk, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans, ++ "bucket incorrectly set in need_discard btree\n" ++ "%s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (bkey_cmp(*discard_pos_done, iter.pos) && ++ ca->mi.discard && !c->opts.nochanges) { ++ /* ++ * This works without any other locks because this is the only ++ * thread that removes items from the need_discard tree ++ */ ++ bch2_trans_unlock(trans); ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ k.k->p.offset * ca->mi.bucket_size, ++ ca->mi.bucket_size, ++ GFP_KERNEL, 0); ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto out; ++ } ++ ++ *discard_pos_done = iter.pos; ++ did_discard = true; ++ ++ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); ++ a->v.data_type = alloc_data_type(a->v, a->v.data_type); ++write: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); ++ if (ret) ++ goto out; ++ ++ if (did_discard) { ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); ++ (*discarded)++; ++ } ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ percpu_ref_put(&ca->io_ref); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_discards_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, discard_work); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; ++ struct 
bpos discard_pos_done = POS_MAX; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * We're doing the commit in bch2_discard_one_bucket instead of using ++ * for_each_btree_key_commit() so that we can increment counters after ++ * successful commit: ++ */ ++ ret = for_each_btree_key2(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, 0, k, ++ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, ++ &seen, ++ &open, ++ &need_journal_commit, ++ &discarded)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (need_journal_commit * 2 > seen) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ percpu_ref_put(&c->writes); ++ ++ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ++ bch2_err_str(ret)); ++} ++ ++void bch2_do_discards(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget_live(&c->writes) && ++ !queue_work(system_long_wq, &c->discard_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int invalidate_one_bucket(struct btree_trans *trans, ++ struct btree_iter *lru_iter, struct bkey_s_c k, ++ unsigned dev_idx, s64 *nr_to_invalidate) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter = { NULL }; ++ struct bkey_i_alloc_v4 *a; ++ struct bpos bucket; ++ struct printbuf buf = PRINTBUF; ++ unsigned cached_sectors; ++ int ret = 0; ++ ++ if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) ++ return 1; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ prt_printf(&buf, "non lru key in lru btree:\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { ++ bch_err(c, "%s", buf.buf); ++ } else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ret = -EINVAL; ++ } ++ ++ goto out; ++ } ++ ++ bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); ++ ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (k.k->p.offset != alloc_lru_idx(a->v)) { ++ prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { ++ bch_err(c, "%s", buf.buf); ++ } else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ret = -EINVAL; ++ } ++ ++ goto out; ++ } ++ ++ if (!a->v.cached_sectors) ++ bch_err(c, "invalidating empty bucket, confused"); ++ ++ cached_sectors = a->v.cached_sectors; ++ ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ a->v.gen++; ++ a->v.data_type = 0; ++ a->v.dirty_sectors = 0; ++ a->v.cached_sectors = 0; ++ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); ++ ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); ++ if (ret) ++ goto out; ++ ++ trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); ++ --*nr_to_invalidate; ++out: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_invalidates_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ 
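The per-device loop that follows invalidates at most should_invalidate_buckets() cached buckets per pass; that helper (defined later in alloc_background.h) aims to keep roughly nbuckets/128 buckets free, counting need_discard buckets as free and subtracting the allocator reserve. A small worked example with hypothetical numbers, for illustration only:

	/*
	 * Hypothetical device with 2^20 buckets: want_free = 1048576 >> 7 = 8192.
	 * With 1000 free + 200 need_discard - 300 reserved = 900 effectively free
	 * and 50000 cached buckets, should_invalidate_buckets() returns
	 * clamp(8192 - 900, 0, 50000) = 7292, so up to 7292 cached buckets are
	 * invalidated this pass; once the effectively-free count reaches 8192 it
	 * returns 0 and the pass does nothing.
	 */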
for_each_member_device(ca, c, i) { ++ s64 nr_to_invalidate = ++ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, ++ POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, ++ invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); ++ ++ if (ret < 0) { ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_do_invalidates(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget_live(&c->writes) && ++ !queue_work(system_long_wq, &c->invalidate_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, struct bch_dev *ca) ++{ ++ struct bch_alloc_v4 a; ++ ++ if (iter->pos.offset >= ca->mi.nbuckets) ++ return 1; ++ ++ bch2_alloc_to_v4(k, &a); ++ return bch2_bucket_do_index(trans, k, &a, true); ++} ++ ++static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_member *m; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bucket_freespace_init(&trans, &iter, k, ca)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret < 0) { ++ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ bool doing_init = false; ++ ++ /* ++ * We can crash during the device add path, so we need to check this on ++ * every mount: ++ */ ++ ++ for_each_member_device(ca, c, i) { ++ if (ca->mi.freespace_initialized) ++ continue; ++ ++ if (!doing_init) { ++ bch_info(c, "initializing freespace"); ++ doing_init = true; ++ } ++ ++ ret = bch2_dev_freespace_init(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ return ret; ++ } ++ } ++ ++ if (doing_init) { ++ mutex_lock(&c->sb_lock); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch_verbose(c, "done initializing freespace"); ++ } ++ ++ return ret; ++} ++ ++/* Bucket IO clocks: */ ++ ++int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ++ size_t bucket_nr, int rw) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ u64 now; ++ int ret = 0; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; ++ ++ now = atomic64_read(&c->io_clock[rw].now); ++ if (a->v.io_time[rw] == now) ++ goto out; ++ ++ a->v.io_time[rw] = now; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* Startup/shutdown (ro/rw): */ ++ ++void bch2_recalc_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve; ++ unsigned bucket_size_max = 0; ++ unsigned long ra_pages = 0; ++ unsigned i; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_online_member(ca, c, i) { ++ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; 
++ ++ ra_pages += bdi->ra_pages; ++ } ++ ++ bch2_set_ra_pages(c, ra_pages); ++ ++ for_each_rw_member(ca, c, i) { ++ u64 dev_reserve = 0; ++ ++ /* ++ * We need to reserve buckets (from the number ++ * of currently available buckets) against ++ * foreground writes so that mainly copygc can ++ * make forward progress. ++ * ++ * We need enough to refill the various reserves ++ * from scratch - copygc will use its entire ++ * reserve all at once, then run against when ++ * its reserve is refilled (from the formerly ++ * available buckets). ++ * ++ * This reserve is just used when considering if ++ * allocations for foreground writes must wait - ++ * not -ENOSPC calculations. ++ */ ++ ++ dev_reserve += ca->nr_btree_reserve * 2; ++ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ ++ ++ dev_reserve += 1; /* btree write point */ ++ dev_reserve += 1; /* copygc write point */ ++ dev_reserve += 1; /* rebalance write point */ ++ ++ dev_reserve *= ca->mi.bucket_size; ++ ++ capacity += bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket); ++ ++ reserved_sectors += dev_reserve * 2; ++ ++ bucket_size_max = max_t(unsigned, bucket_size_max, ++ ca->mi.bucket_size); ++ } ++ ++ gc_reserve = c->opts.gc_reserve_bytes ++ ? c->opts.gc_reserve_bytes >> 9 ++ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); ++ ++ reserved_sectors = max(gc_reserve, reserved_sectors); ++ ++ reserved_sectors = min(reserved_sectors, capacity); ++ ++ c->capacity = capacity - reserved_sectors; ++ ++ c->bucket_size_max = bucket_size_max; ++ ++ /* Wake up case someone was waiting for buckets */ ++ closure_wake_up(&c->freelist_wait); ++} ++ ++static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct open_bucket *ob; ++ bool ret = false; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list && ++ ob->dev == ca->dev_idx) ++ ret = true; ++ spin_unlock(&ob->lock); ++ } ++ ++ return ret; ++} ++ ++/* device goes ro: */ ++void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ /* First, remove device from allocation groups: */ ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ ++ /* ++ * Capacity is calculated based off of devices in allocation groups: ++ */ ++ bch2_recalc_capacity(c); ++ ++ /* Next, close write points that point to this device... 
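As a worked example of the reserve math in bch2_recalc_capacity() above (all numbers hypothetical): a single rw device with 2^20 buckets of 1024 sectors, first_bucket 16 and an assumed nr_btree_reserve of 64 gives

	dev_reserve = 64 * 2 + (1048576 >> 6) + 3 = 16515 buckets = 16,911,360 sectors
	capacity    = (1048576 - 16) * 1024        = 1,073,725,440 sectors
	reserved    = 2 * 16,911,360               = 33,822,720 sectors

and, assuming the default gc_reserve_percent of 8, gc_reserve = capacity * 8 / 100 ≈ 85.9M sectors wins the max(), so c->capacity ends up ≈ 987.8M sectors, i.e. roughly 92% of the raw space is advertised as usable.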
*/ ++ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) ++ bch2_writepoint_stop(c, ca, &c->write_points[i]); ++ ++ bch2_writepoint_stop(c, ca, &c->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); ++ bch2_writepoint_stop(c, ca, &c->btree_write_point); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ bch2_open_buckets_put(c, &a->ob); ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ while (1) { ++ struct open_bucket *ob; ++ ++ spin_lock(&c->freelist_lock); ++ if (!ca->open_buckets_partial_nr) { ++ spin_unlock(&c->freelist_lock); ++ break; ++ } ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_ec_stop_dev(c, ca); ++ ++ /* ++ * Wake up threads that were blocked on allocation, so they can notice ++ * the device can no longer be removed and the capacity has changed: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ ++ /* ++ * journal_res_get() can block waiting for free space in the journal - ++ * it needs to notice there may not be devices to allocate from anymore: ++ */ ++ wake_up(&c->journal.wait); ++ ++ /* Now wait for any in flight writes: */ ++ ++ closure_wait_event(&c->open_buckets_wait, ++ !bch2_dev_has_open_write_point(c, ca)); ++} ++ ++/* device goes rw: */ ++void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (ca->mi.data_allowed & (1 << i)) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++} ++ ++void bch2_fs_allocator_background_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->freelist_lock); ++ INIT_WORK(&c->discard_work, bch2_do_discards_work); ++ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +new file mode 100644 +index 000000000000..044bc72992d4 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.h +@@ -0,0 +1,183 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_BACKGROUND_H ++#define _BCACHEFS_ALLOC_BACKGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "buckets.h" ++#include "debug.h" ++#include "super.h" ++ ++/* How out of date a pointer gen is allowed to be: */ ++#define BUCKET_GC_GEN_MAX 96U ++ ++static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) ++{ ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, pos.inode)) ++ return false; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ return pos.offset >= ca->mi.first_bucket && ++ pos.offset < ca->mi.nbuckets; ++} ++ ++static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) ++{ ++ return a.gen - a.oldest_gen; ++} ++ ++static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, ++ u32 cached_sectors, ++ u32 stripe, ++ struct bch_alloc_v4 a, ++ enum bch_data_type data_type) ++{ ++ if (dirty_sectors) ++ return data_type; ++ if (stripe) ++ return BCH_DATA_stripe; ++ if (cached_sectors) ++ return BCH_DATA_cached; ++ if (BCH_ALLOC_V4_NEED_DISCARD(&a)) ++ return BCH_DATA_need_discard; ++ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) ++ return BCH_DATA_need_gc_gens; ++ return BCH_DATA_free; ++} ++ ++static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, ++ enum bch_data_type data_type) ++{ ++ return __alloc_data_type(a.dirty_sectors, a.cached_sectors, ++ 
a.stripe, a, data_type); ++} ++ ++static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) ++{ ++ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; ++} ++ ++static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) ++{ ++ return ((u64) alloc_gc_gen(a) >> 4) << 56; ++} ++ ++static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) ++{ ++ pos.offset |= alloc_freespace_genbits(a); ++ return pos; ++} ++ ++static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) ++{ ++ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: ++ BCH_ALLOC_V4_U64s_V0) + ++ BCH_ALLOC_V4_NR_BACKPOINTERS(a) * ++ (sizeof(struct bch_backpointer) / sizeof(u64)); ++ ++ BUG_ON(ret > U8_MAX - BKEY_U64s); ++ return ret; ++} ++ ++static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) ++{ ++ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); ++ ++void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); ++ ++int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); ++ ++#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) ++ ++int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_alloc_v4_swab(struct bkey_s); ++void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v1_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v2_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v3_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v4_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .swab = bch2_alloc_v4_swab, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++static inline bool bkey_is_alloc(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_alloc || ++ k->type == KEY_TYPE_alloc_v2 || ++ k->type == KEY_TYPE_alloc_v3; ++} ++ ++int bch2_alloc_read(struct bch_fs *); ++ ++int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_check_alloc_info(struct bch_fs *); ++int bch2_check_alloc_to_lru_refs(struct bch_fs *); ++void bch2_do_discards(struct bch_fs *); ++ ++static inline u64 should_invalidate_buckets(struct bch_dev *ca, ++ struct bch_dev_usage u) ++{ ++ u64 want_free = ca->mi.nbuckets >> 7; ++ u64 free = max_t(s64, 0, ++ u.d[BCH_DATA_free].buckets ++ + u.d[BCH_DATA_need_discard].buckets ++ - bch2_dev_buckets_reserved(ca, RESERVE_none)); ++ ++ return 
clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); ++} ++ ++void bch2_do_invalidates(struct bch_fs *); ++ ++static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) ++{ ++ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); ++} ++ ++static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) ++{ ++ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *); ++ ++void bch2_recalc_capacity(struct bch_fs *); ++ ++void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); ++ ++void bch2_fs_allocator_background_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +new file mode 100644 +index 000000000000..6e52230e69e1 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.c +@@ -0,0 +1,1380 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright 2012 Google, Inc. ++ * ++ * Foreground allocator code: allocate buckets from freelist, and allocate in ++ * sector granularity from writepoints. ++ * ++ * bch2_bucket_alloc() allocates a single bucket from a specific device. ++ * ++ * bch2_bucket_alloc_set() allocates one or more buckets from different devices ++ * in a given filesystem. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "buckets_waiting_for_journal.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "movinggc.h" ++ ++#include ++#include ++#include ++#include ++ ++const char * const bch2_alloc_reserves[] = { ++#define x(t) #t, ++ BCH_ALLOC_RESERVES() ++#undef x ++ NULL ++}; ++ ++/* ++ * Open buckets represent a bucket that's currently being allocated from. They ++ * serve two purposes: ++ * ++ * - They track buckets that have been partially allocated, allowing for ++ * sub-bucket sized allocations - they're used by the sector allocator below ++ * ++ * - They provide a reference to the buckets they own that mark and sweep GC ++ * can find, until the new allocation has a pointer to it inserted into the ++ * btree ++ * ++ * When allocating some space with the sector allocator, the allocation comes ++ * with a reference to an open bucket - the caller is required to put that ++ * reference _after_ doing the index update that makes its allocation reachable. 
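For illustration, a hypothetical caller following that contract could look like the sketch below. It only shows the ordering described above (the open-bucket reference is put after the index update); real callers go through the write point code later in this file, and demo_alloc_one_bucket() is not a function from the patch:

	static int demo_alloc_one_bucket(struct bch_fs *c, struct bch_dev *ca,
					 struct closure *cl)
	{
		struct open_bucket *ob =
			bch2_bucket_alloc(c, ca, RESERVE_none, false, cl);

		if (IS_ERR(ob))
			return PTR_ERR(ob);

		// ... write into the bucket, then insert the key pointing at it ...

		bch2_open_bucket_put(c, ob);	// drop the reference only after the index update
		return 0;
	}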
++ */ ++ ++static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ ob->hash = *slot; ++ *slot = idx; ++} ++ ++static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ while (*slot != idx) { ++ BUG_ON(!*slot); ++ slot = &c->open_buckets[*slot].hash; ++ } ++ ++ *slot = ob->hash; ++ ob->hash = 0; ++} ++ ++void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ if (ob->ec) { ++ bch2_ec_bucket_written(c, ob); ++ return; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&ob->lock); ++ ++ ob->valid = false; ++ ob->data_type = 0; ++ ++ spin_unlock(&ob->lock); ++ percpu_up_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ bch2_open_bucket_hash_remove(c, ob); ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ ++ c->open_buckets_nr_free++; ++ ca->nr_open_buckets--; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *c, ++ struct open_buckets *obs, ++ unsigned dev) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->dev == dev && ob->ec) ++ bch2_ec_bucket_cancel(c, ob); ++} ++ ++static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); ++ ++ ob = c->open_buckets + c->open_buckets_freelist; ++ c->open_buckets_freelist = ob->freelist; ++ atomic_set(&ob->pin, 1); ++ ob->data_type = 0; ++ ++ c->open_buckets_nr_free--; ++ return ob; ++} ++ ++static void open_bucket_free_unused(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ bool may_realloc = wp->data_type == BCH_DATA_user; ++ ++ BUG_ON(ca->open_buckets_partial_nr > ++ ARRAY_SIZE(ca->open_buckets_partial)); ++ ++ if (ca->open_buckets_partial_nr < ++ ARRAY_SIZE(ca->open_buckets_partial) && ++ may_realloc) { ++ spin_lock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++ closure_wake_up(&c->freelist_wait); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++} ++ ++/* _only_ for allocating the journal on a new device: */ ++long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++{ ++ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { ++ u64 b = ca->new_fs_bucket_idx++; ++ ++ if (!is_superblock_bucket(ca, b) && ++ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) ++ return b; ++ } ++ ++ return -1; ++} ++ ++static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) ++{ ++ switch (reserve) { ++ case RESERVE_btree: ++ case RESERVE_btree_movinggc: ++ return 0; ++ case RESERVE_movinggc: ++ return OPEN_BUCKETS_COUNT / 4; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ ++static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ u64 bucket, ++ enum alloc_reserve reserve, ++ struct bch_alloc_v4 *a, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 
*skipped_nouse, ++ struct closure *cl) ++{ ++ struct open_bucket *ob; ++ ++ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { ++ (*skipped_nouse)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ (*skipped_open)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { ++ (*skipped_need_journal_commit)++; ++ return NULL; ++ } ++ ++ spin_lock(&c->freelist_lock); ++ ++ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { ++ if (cl) ++ closure_wait(&c->open_buckets_wait, cl); ++ ++ if (!c->blocked_allocate_open_bucket) ++ c->blocked_allocate_open_bucket = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ return ERR_PTR(-BCH_ERR_open_buckets_empty); ++ } ++ ++ /* Recheck under lock: */ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ spin_unlock(&c->freelist_lock); ++ (*skipped_open)++; ++ return NULL; ++ } ++ ++ ob = bch2_open_bucket_alloc(c); ++ ++ spin_lock(&ob->lock); ++ ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->alloc_reserve = reserve; ++ ob->dev = ca->dev_idx; ++ ob->gen = a->gen; ++ ob->bucket = bucket; ++ spin_unlock(&ob->lock); ++ ++ ca->nr_open_buckets++; ++ bch2_open_bucket_hash_add(c, ob); ++ ++ if (c->blocked_allocate_open_bucket) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate_open_bucket], ++ c->blocked_allocate_open_bucket); ++ c->blocked_allocate_open_bucket = 0; ++ } ++ ++ if (c->blocked_allocate) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate], ++ c->blocked_allocate); ++ c->blocked_allocate = 0; ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, ++ enum alloc_reserve reserve, u64 free_entry, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct bkey_s_c freespace_k, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ struct open_bucket *ob; ++ struct bch_alloc_v4 a; ++ u64 b = free_entry & ~(~0ULL << 56); ++ unsigned genbits = free_entry >> 56; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { ++ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" ++ " freespace key ", ++ ca->mi.first_bucket, ca->mi.nbuckets); ++ bch2_bkey_val_to_text(&buf, c, freespace_k); ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (genbits != (alloc_freespace_genbits(a) >> 56)) { ++ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" ++ " freespace key ", ++ genbits, alloc_freespace_genbits(a) >> 56); ++ bch2_bkey_val_to_text(&buf, c, freespace_k); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ob = ERR_PTR(-EIO); ++ goto err; ++ ++ } ++ ++ if (a.data_type != BCH_DATA_free) { ++ prt_printf(&buf, "non free bucket in freespace btree\n" ++ " freespace key "); ++ bch2_bkey_val_to_text(&buf, c, 
freespace_k); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ ob = __try_alloc_bucket(c, ca, b, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ if (!ob) ++ iter.path->preserve = false; ++err: ++ set_btree_iter_dontneed(&iter); ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ struct open_bucket *ob; ++ int i; ++ ++ spin_lock(&c->freelist_lock); ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ return NULL; ++} ++ ++/* ++ * This path is for before the freespace btree is initialized: ++ * ++ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & ++ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx ++ */ ++static noinline struct open_bucket * ++bch2_bucket_alloc_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); ++ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), ++ BTREE_ITER_SLOTS, k, ret) { ++ struct bch_alloc_v4 a; ++ ++ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ if (ca->new_fs_bucket_idx && ++ is_superblock_bucket(ca, k.k->p.offset)) ++ continue; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.data_type != BCH_DATA_free) ++ continue; ++ ++ (*buckets_seen)++; ++ ++ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ *cur_bucket = iter.pos.offset; ++ ++ return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); ++} ++ ++static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ BUG_ON(ca->new_fs_bucket_idx); ++ ++ /* ++ * XXX: ++ * On transaction restart, we'd like to restart from the bucket we were ++ * at previously ++ */ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, ++ POS(ca->dev_idx, *cur_bucket), 0, k, ret) { ++ if (k.k->p.inode != ca->dev_idx) ++ break; ++ ++ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); ++ *cur_bucket < k.k->p.offset; ++ (*cur_bucket)++) { ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) ++ break; ++ ++ (*buckets_seen)++; ++ ++ ob = try_alloc_bucket(trans, ca, 
reserve, ++ *cur_bucket, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ k, cl); ++ if (ob) ++ break; ++ } ++ ++ if (ob || ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ob ?: ERR_PTR(ret); ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct open_bucket *ob = NULL; ++ struct bch_dev_usage usage; ++ bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); ++ u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor; ++ u64 avail; ++ u64 cur_bucket = start; ++ u64 buckets_seen = 0; ++ u64 skipped_open = 0; ++ u64 skipped_need_journal_commit = 0; ++ u64 skipped_nouse = 0; ++ bool waiting = false; ++ int ret; ++again: ++ usage = bch2_dev_usage_read(ca); ++ avail = dev_buckets_free(ca, usage, reserve); ++ ++ if (usage.d[BCH_DATA_need_discard].buckets > avail) ++ bch2_do_discards(c); ++ ++ if (usage.d[BCH_DATA_need_gc_gens].buckets > avail) ++ bch2_do_gc_gens(c); ++ ++ if (should_invalidate_buckets(ca, usage)) ++ bch2_do_invalidates(c); ++ ++ if (!avail) { ++ if (cl && !waiting) { ++ closure_wait(&c->freelist_wait, cl); ++ waiting = true; ++ goto again; ++ } ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ ob = ERR_PTR(-BCH_ERR_freelist_empty); ++ goto err; ++ } ++ ++ if (waiting) ++ closure_wake_up(&c->freelist_wait); ++ ++ if (may_alloc_partial) { ++ ob = try_alloc_partial_bucket(c, ca, reserve); ++ if (ob) ++ return ob; ++ } ++ ++ ob = likely(ca->mi.freespace_initialized) ++ ? 
bch2_bucket_alloc_freelist(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl) ++ : bch2_bucket_alloc_early(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl); ++ ++ if (skipped_need_journal_commit * 2 > avail) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ if (!ob && !ret && !freespace_initialized && start) { ++ start = cur_bucket = 0; ++ goto again; ++ } ++ ++ if (!freespace_initialized) ++ ca->bucket_alloc_trans_early_cursor = cur_bucket; ++err: ++ if (!ob) ++ ob = ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); ++ ++ if (IS_ERR(ob)) { ++ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], ++ usage.d[BCH_DATA_free].buckets, ++ avail, ++ bch2_copygc_wait_amount(c), ++ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl == NULL, ++ bch2_err_str(PTR_ERR(ob))); ++ atomic_long_inc(&c->bucket_alloc_fail); ++ } ++ ++ return ob; ++} ++ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ may_alloc_partial, cl))); ++ return ob; ++} ++ ++static int __dev_stripe_cmp(struct dev_stripe_state *stripe, ++ unsigned l, unsigned r) ++{ ++ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - ++ (stripe->next_alloc[l] < stripe->next_alloc[r])); ++} ++ ++#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs) ++{ ++ struct dev_alloc_list ret = { .nr = 0 }; ++ unsigned i; ++ ++ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) ++ ret.devs[ret.nr++] = i; ++ ++ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ return ret; ++} ++ ++void bch2_dev_stripe_increment(struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ u64 *v = stripe->next_alloc + ca->dev_idx; ++ u64 free_space = dev_buckets_available(ca, RESERVE_none); ++ u64 free_space_inv = free_space ++ ? div64_u64(1ULL << 48, free_space) ++ : 1ULL << 48; ++ u64 scale = *v / 4; ++ ++ if (*v + free_space_inv >= *v) ++ *v += free_space_inv; ++ else ++ *v = U64_MAX; ++ ++ for (v = stripe->next_alloc; ++ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) ++ *v = *v < scale ? 0 : *v - scale; ++} ++ ++#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) ++#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) ++ ++static void add_new_bucket(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct open_bucket *ob) ++{ ++ unsigned durability = ++ bch_dev_bkey_exists(c, ob->dev)->mi.durability; ++ ++ __clear_bit(ob->dev, devs_may_alloc->d); ++ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ++ ? 
durability : 1; ++ *have_cache |= !durability; ++ ++ ob_push(c, ptrs, ob); ++} ++ ++static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct dev_alloc_list devs_sorted = ++ bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ unsigned dev; ++ struct bch_dev *ca; ++ int ret = -BCH_ERR_insufficient_devices; ++ unsigned i; ++ ++ BUG_ON(*nr_effective >= nr_replicas); ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ struct open_bucket *ob; ++ ++ dev = devs_sorted.devs[i]; ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ continue; ++ ++ if (!ca->mi.durability && *have_cache) { ++ percpu_ref_put(&ca->ref); ++ continue; ++ } ++ ++ ob = bch2_bucket_alloc_trans(trans, ca, reserve, ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (!IS_ERR(ob)) ++ bch2_dev_stripe_increment(ca, stripe); ++ percpu_ref_put(&ca->ref); ++ ++ if (IS_ERR(ob)) { ++ ret = PTR_ERR(ob); ++ if (ret == -EINTR || cl) ++ break; ++ continue; ++ } ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ ++ if (*nr_effective >= nr_replicas) { ++ ret = 0; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, ++ devs_may_alloc, nr_replicas, ++ nr_effective, have_cache, reserve, ++ flags, cl)); ++} ++ ++/* Allocate from stripes: */ ++ ++/* ++ * if we can't allocate a new stripe because there are already too many ++ * partially filled stripes, force allocating from an existing stripe even when ++ * it's to a device we don't want: ++ */ ++ ++static int bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct dev_alloc_list devs_sorted; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ struct bch_dev *ca; ++ unsigned i, ec_idx; ++ ++ if (!erasure_code) ++ return 0; ++ ++ if (nr_replicas < 2) ++ return 0; ++ ++ if (ec_open_bucket(c, ptrs)) ++ return 0; ++ ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, ++ wp == &c->copygc_write_point, ++ cl); ++ if (IS_ERR(h)) ++ return -PTR_ERR(h); ++ if (!h) ++ return 0; ++ ++ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ ++ for (i = 0; i < devs_sorted.nr; i++) ++ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { ++ if (!h->s->blocks[ec_idx]) ++ continue; ++ ++ ob = c->open_buckets + h->s->blocks[ec_idx]; ++ if (ob->dev == devs_sorted.devs[i] && ++ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) ++ goto got_bucket; ++ } ++ goto out_put_head; ++got_bucket: ++ ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ ob->ec_idx = ec_idx; ++ ob->ec = h->s; ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, 
have_cache, flags, ob); ++ atomic_inc(&h->s->pin); ++out_put_head: ++ bch2_ec_stripe_head_put(c, h); ++ return 0; ++} ++ ++/* Sector allocator */ ++ ++static void get_buckets_from_writepoint(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ bool need_ec) ++{ ++ struct open_buckets ptrs_skip = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ if (*nr_effective < nr_replicas && ++ test_bit(ob->dev, devs_may_alloc->d) && ++ (ca->mi.durability || ++ (wp->data_type == BCH_DATA_user && !*have_cache)) && ++ (ob->ec || !need_ec)) { ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, ++ flags, ob); ++ } else { ++ ob_push(c, &ptrs_skip, ob); ++ } ++ } ++ wp->ptrs = ptrs_skip; ++} ++ ++static int open_bucket_add_buckets(struct btree_trans *trans, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ struct closure *cl = NULL; ++ int ret; ++ unsigned i; ++ ++ rcu_read_lock(); ++ devs = target_rw_devs(c, wp->data_type, target); ++ rcu_read_unlock(); ++ ++ /* Don't allocate from devices we already have pointers to: */ ++ for (i = 0; i < devs_have->nr; i++) ++ __clear_bit(devs_have->devs[i], devs.d); ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ __clear_bit(ob->dev, devs.d); ++ ++ if (erasure_code) { ++ if (!ec_open_bucket(c, ptrs)) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ if (!ec_open_bucket(c, ptrs)) { ++ ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags, _cl); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty) || ++ bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) ++ return ret; ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ } ++ ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, false); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++retry_blocking: ++ /* ++ * Try nonblocking first, so that if one device is full we'll try from ++ * other devices: ++ */ ++ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, ++ nr_replicas, nr_effective, have_cache, ++ reserve, flags, cl); ++ if (ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && ++ !cl && _cl) { ++ cl = _cl; ++ goto retry_blocking; ++ } ++ ++ return ret; ++} ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, ++ struct open_buckets *obs) ++{ ++ struct open_buckets ptrs = { .nr = 0 }; ++ struct open_bucket *ob, *ob2; ++ unsigned i, j; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ bool drop = !ca || ob->dev == ca->dev_idx; ++ ++ if (!drop && ob->ec) { ++ mutex_lock(&ob->ec->lock); ++ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { ++ if (!ob->ec->blocks[j]) ++ continue; ++ ++ ob2 = 
c->open_buckets + ob->ec->blocks[j]; ++ drop |= ob2->dev == ca->dev_idx; ++ } ++ mutex_unlock(&ob->ec->lock); ++ } ++ ++ if (drop) ++ bch2_open_bucket_put(c, ob); ++ else ++ ob_push(c, &ptrs, ob); ++ } ++ ++ *obs = ptrs; ++} ++ ++void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, ++ struct write_point *wp) ++{ ++ mutex_lock(&wp->lock); ++ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); ++ mutex_unlock(&wp->lock); ++} ++ ++static inline struct hlist_head *writepoint_hash(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ unsigned hash = ++ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); ++ ++ return &c->write_points_hash[hash]; ++} ++ ++static struct write_point *__writepoint_find(struct hlist_head *head, ++ unsigned long write_point) ++{ ++ struct write_point *wp; ++ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(wp, head, node) ++ if (wp->write_point == write_point) ++ goto out; ++ wp = NULL; ++out: ++ rcu_read_unlock(); ++ return wp; ++} ++ ++static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) ++{ ++ u64 stranded = c->write_points_nr * c->bucket_size_max; ++ u64 free = bch2_fs_usage_read_short(c).free; ++ ++ return stranded * factor > free; ++} ++ ++static bool try_increase_writepoints(struct bch_fs *c) ++{ ++ struct write_point *wp; ++ ++ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || ++ too_many_writepoints(c, 32)) ++ return false; ++ ++ wp = c->write_points + c->write_points_nr++; ++ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); ++ return true; ++} ++ ++static bool try_decrease_writepoints(struct bch_fs *c, ++ unsigned old_nr) ++{ ++ struct write_point *wp; ++ ++ mutex_lock(&c->write_points_hash_lock); ++ if (c->write_points_nr < old_nr) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return true; ++ } ++ ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return false; ++ } ++ ++ wp = c->write_points + --c->write_points_nr; ++ ++ hlist_del_rcu(&wp->node); ++ mutex_unlock(&c->write_points_hash_lock); ++ ++ bch2_writepoint_stop(c, NULL, wp); ++ return true; ++} ++ ++static void bch2_trans_mutex_lock(struct btree_trans *trans, ++ struct mutex *lock) ++{ ++ if (!mutex_trylock(lock)) { ++ bch2_trans_unlock(trans); ++ mutex_lock(lock); ++ } ++} ++ ++static struct write_point *writepoint_find(struct btree_trans *trans, ++ unsigned long write_point) ++{ ++ struct bch_fs *c = trans->c; ++ struct write_point *wp, *oldest; ++ struct hlist_head *head; ++ ++ if (!(write_point & 1UL)) { ++ wp = (struct write_point *) write_point; ++ bch2_trans_mutex_lock(trans, &wp->lock); ++ return wp; ++ } ++ ++ head = writepoint_hash(c, write_point); ++restart_find: ++ wp = __writepoint_find(head, write_point); ++ if (wp) { ++lock_wp: ++ bch2_trans_mutex_lock(trans, &wp->lock); ++ if (wp->write_point == write_point) ++ goto out; ++ mutex_unlock(&wp->lock); ++ goto restart_find; ++ } ++restart_find_oldest: ++ oldest = NULL; ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) ++ if (!oldest || time_before64(wp->last_used, oldest->last_used)) ++ oldest = wp; ++ ++ bch2_trans_mutex_lock(trans, &oldest->lock); ++ bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); ++ if (oldest >= c->write_points + c->write_points_nr || ++ try_increase_writepoints(c)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto restart_find_oldest; ++ } ++ ++ wp = __writepoint_find(head, write_point); ++ if (wp && wp != 
oldest) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto lock_wp; ++ } ++ ++ wp = oldest; ++ hlist_del_rcu(&wp->node); ++ wp->write_point = write_point; ++ hlist_add_head_rcu(&wp->node, head); ++ mutex_unlock(&c->write_points_hash_lock); ++out: ++ wp->last_used = sched_clock(); ++ return wp; ++} ++ ++/* ++ * Get us an open_bucket we can allocate from, return with it locked: ++ */ ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct write_point *wp; ++ struct open_bucket *ob; ++ struct open_buckets ptrs; ++ unsigned nr_effective, write_points_nr; ++ unsigned ob_flags = 0; ++ bool have_cache; ++ int ret; ++ int i; ++ ++ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; ++ ++ BUG_ON(!nr_replicas || !nr_replicas_required); ++retry: ++ ptrs.nr = 0; ++ nr_effective = 0; ++ write_points_nr = c->write_points_nr; ++ have_cache = false; ++ ++ wp = writepoint_find(trans, write_point.v); ++ ++ if (wp->data_type == BCH_DATA_user) ++ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; ++ ++ /* metadata may not allocate on cache devices: */ ++ if (wp->data_type != BCH_DATA_user) ++ have_cache = true; ++ ++ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } else { ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, NULL); ++ if (!ret || ret == -EINTR) ++ goto alloc_done; ++ ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, ++ 0, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } ++alloc_done: ++ BUG_ON(!ret && nr_effective < nr_replicas); ++ ++ if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ pr_debug("failed to get ec bucket: ret %u", ret); ++ ++ if (ret == -BCH_ERR_insufficient_devices && ++ nr_effective >= nr_replicas_required) ++ ret = 0; ++ ++ if (ret) ++ goto err; ++ ++ /* Free buckets we didn't use: */ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_free_unused(c, wp, ob); ++ ++ wp->ptrs = ptrs; ++ ++ wp->sectors_free = UINT_MAX; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ ++ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ ++ return wp; ++err: ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ++ ob_push(c, &ptrs, ob); ++ else ++ open_bucket_free_unused(c, wp, ob); ++ wp->ptrs = ptrs; ++ ++ mutex_unlock(&wp->lock); ++ ++ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && ++ try_decrease_writepoints(c, write_points_nr)) ++ goto retry; ++ ++ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty)) ++ return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); ++ ++ if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) ++ return ERR_PTR(-EROFS); ++ ++ return ERR_PTR(ret); ++} ++ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, ++ erasure_code, ++ write_point, ++ devs_have, ++ nr_replicas, ++ nr_replicas_required, ++ reserve, ++ flags, cl))); ++ return wp; ++ ++} ++ ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ return (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = ob->gen, ++ .dev = ob->dev, ++ .offset = bucket_to_sector(ca, ob->bucket) + ++ ca->mi.bucket_size - ++ ob->sectors_free, ++ }; ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, ++ struct bkey_i *k, unsigned sectors, ++ bool cached) ++ ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(sectors > wp->sectors_free); ++ wp->sectors_free -= sectors; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ++ ++ ptr.cached = cached || ++ (!ca->mi.durability && ++ wp->data_type == BCH_DATA_user); ++ ++ bch2_bkey_append_ptr(k, ptr); ++ ++ BUG_ON(sectors > ob->sectors_free); ++ ob->sectors_free -= sectors; ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); ++ wp->ptrs = keep; ++ ++ mutex_unlock(&wp->lock); ++ ++ bch2_open_buckets_put(c, &ptrs); ++} ++ ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->data_type = type; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ struct write_point *wp; ++ ++ mutex_init(&c->write_points_hash_lock); ++ c->write_points_nr = ARRAY_SIZE(c->write_points); ++ ++ /* open bucket 0 is a sentinal NULL: */ ++ spin_lock_init(&c->open_buckets[0].lock); ++ ++ for (ob = c->open_buckets + 1; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { ++ spin_lock_init(&ob->lock); ++ c->open_buckets_nr_free++; ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ } ++ ++ writepoint_init(&c->btree_write_point, BCH_DATA_btree); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); ++ writepoint_init(&c->copygc_write_point, BCH_DATA_user); ++ ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) { ++ writepoint_init(wp, BCH_DATA_user); ++ ++ wp->last_used = sched_clock(); ++ wp->write_point = (unsigned long) wp; ++ hlist_add_head_rcu(&wp->node, ++ writepoint_hash(c, wp->write_point)); ++ } ++} ++ ++void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list) { ++ prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n", ++ ob - c->open_buckets, ++ atomic_read(&ob->pin), ++ bch2_data_types[ob->data_type], ++ ob->dev, ob->bucket, ob->gen); ++ } ++ spin_unlock(&ob->lock); ++ } ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +new file mode 100644 +index 000000000000..6de63a351fa8 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.h +@@ -0,0 +1,181 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_FOREGROUND_H ++#define _BCACHEFS_ALLOC_FOREGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++ ++#include ++ ++struct bkey; ++struct bch_dev; ++struct bch_fs; ++struct bch_devs_List; ++ ++extern const char * const bch2_alloc_reserves[]; ++ ++struct dev_alloc_list { ++ unsigned nr; ++ u8 devs[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *); ++void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); ++ ++long bch2_bucket_alloc_new_fs(struct bch_dev *); ++ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, ++ enum alloc_reserve, bool, ++ struct closure *); ++ ++static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ++ struct open_bucket *ob) ++{ ++ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); ++ ++ obs->v[obs->nr++] = ob - c->open_buckets; ++} ++ ++#define open_bucket_for_each(_c, _obs, _ob, _i) \ ++ for ((_i) = 0; \ ++ (_i) < (_obs)->nr && \ ++ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ ++ (_i)++) ++ ++static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, ++ struct open_buckets *obs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ec) ++ return ob; ++ ++ return NULL; ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *, ++ struct open_buckets *, unsigned); ++ ++void __bch2_open_bucket_put(struct bch_fs *, struct 
open_bucket *); ++ ++static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ if (atomic_dec_and_test(&ob->pin)) ++ __bch2_open_bucket_put(c, ob); ++} ++ ++static inline void bch2_open_buckets_put(struct bch_fs *c, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ bch2_open_bucket_put(c, ob); ++ ptrs->nr = 0; ++} ++ ++static inline void bch2_open_bucket_get(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ ob->data_type = wp->data_type; ++ atomic_inc(&ob->pin); ++ ob_push(c, ptrs, ob); ++ } ++} ++ ++static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, ++ unsigned dev, u64 bucket) ++{ ++ return c->open_buckets_hash + ++ (jhash_3words(dev, bucket, bucket >> 32, 0) & ++ (OPEN_BUCKETS_COUNT - 1)); ++} ++ ++static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); ++ ++ while (slot) { ++ struct open_bucket *ob = &c->open_buckets[slot]; ++ ++ if (ob->dev == dev && ob->bucket == bucket) ++ return true; ++ ++ slot = ob->hash; ++ } ++ ++ return false; ++} ++ ++static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ bool ret; ++ ++ if (bch2_bucket_is_open(c, dev, bucket)) ++ return true; ++ ++ spin_lock(&c->freelist_lock); ++ ret = bch2_bucket_is_open(c, dev, bucket); ++ spin_unlock(&c->freelist_lock); ++ ++ return ret; ++} ++ ++int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++ struct dev_stripe_state *, struct bch_devs_mask *, ++ unsigned, unsigned *, bool *, enum alloc_reserve, ++ unsigned, struct closure *); ++ ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++ ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, ++ struct bkey_i *, unsigned, bool); ++void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, ++ struct open_buckets *); ++ ++void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, ++ struct write_point *); ++ ++static inline struct write_point_specifier writepoint_hashed(unsigned long v) ++{ ++ return (struct write_point_specifier) { .v = v | 1 }; ++} ++ ++static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) ++{ ++ return (struct write_point_specifier) { .v = (unsigned long) wp }; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *); ++ ++void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +new file mode 100644 +index 000000000000..e078584d46f6 +--- /dev/null ++++ b/fs/bcachefs/alloc_types.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_TYPES_H ++#define _BCACHEFS_ALLOC_TYPES_H ++ 
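[Editor's note: the following sketch is not part of the patch. Assuming only the declarations in fs/bcachefs/alloc_foreground.h above, it illustrates the intended calling sequence for the sector allocator: reserve space at a write point, append the resulting pointers to a key, then release the write point. The function name example_alloc_one_extent, the replica counts, and the omitted write-path details are illustrative assumptions, not code from the patch.]

/* Illustrative sketch only -- not part of this patch. */
#include <linux/sched.h>		/* for current */
#include "bcachefs.h"
#include "alloc_foreground.h"

static int example_alloc_one_extent(struct bch_fs *c, struct bkey_i *k,
				    unsigned sectors, struct closure *cl)
{
	struct bch_devs_list devs_have = { .nr = 0 };	/* no devices to avoid */
	struct write_point *wp;

	wp = bch2_alloc_sectors_start(c,
			0,				/* target: any device */
			0,				/* erasure_code: off */
			writepoint_hashed((unsigned long) current),
			&devs_have,
			1,				/* nr_replicas */
			1,				/* nr_replicas_required */
			RESERVE_none,
			0,				/* flags */
			cl);
	if (IS_ERR(wp))		/* -EAGAIN means wait on @cl and retry */
		return PTR_ERR(wp);

	/* Never take more than the write point can currently hand out: */
	sectors = min(sectors, wp->sectors_free);

	/* Add pointers to @k, charging @sectors against each open bucket: */
	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);

	/* Drop now-full buckets and unlock the write point: */
	bch2_alloc_sectors_done(c, wp);
	return 0;
}

[The same start/append/done pattern is what the write paths later in the patch build on; only the reserve, target and replica parameters change.]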
++#include ++#include ++ ++#include "clock_types.h" ++#include "fifo.h" ++ ++struct ec_bucket_buf; ++ ++#define BCH_ALLOC_RESERVES() \ ++ x(btree_movinggc) \ ++ x(btree) \ ++ x(movinggc) \ ++ x(none) ++ ++enum alloc_reserve { ++#define x(name) RESERVE_##name, ++ BCH_ALLOC_RESERVES() ++#undef x ++}; ++ ++#define OPEN_BUCKETS_COUNT 1024 ++ ++#define WRITE_POINT_HASH_NR 32 ++#define WRITE_POINT_MAX 32 ++ ++/* ++ * 0 is never a valid open_bucket_idx_t: ++ */ ++typedef u16 open_bucket_idx_t; ++ ++struct open_bucket { ++ spinlock_t lock; ++ atomic_t pin; ++ open_bucket_idx_t freelist; ++ open_bucket_idx_t hash; ++ ++ /* ++ * When an open bucket has an ec_stripe attached, this is the index of ++ * the block in the stripe this open_bucket corresponds to: ++ */ ++ u8 ec_idx; ++ enum bch_data_type data_type:8; ++ unsigned valid:1; ++ unsigned on_partial_list:1; ++ unsigned alloc_reserve:3; ++ ++ u8 dev; ++ u8 gen; ++ u32 sectors_free; ++ u64 bucket; ++ struct ec_stripe_new *ec; ++}; ++ ++#define OPEN_BUCKET_LIST_MAX 15 ++ ++struct open_buckets { ++ open_bucket_idx_t nr; ++ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; ++}; ++ ++struct dev_stripe_state { ++ u64 next_alloc[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct write_point { ++ struct hlist_node node; ++ struct mutex lock; ++ u64 last_used; ++ unsigned long write_point; ++ enum bch_data_type data_type; ++ ++ /* calculated based on how many pointers we're actually going to use: */ ++ unsigned sectors_free; ++ ++ struct open_buckets ptrs; ++ struct dev_stripe_state stripe; ++}; ++ ++struct write_point_specifier { ++ unsigned long v; ++}; ++ ++#endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c +new file mode 100644 +index 000000000000..5a46b25b0587 +--- /dev/null ++++ b/fs/bcachefs/backpointers.c +@@ -0,0 +1,875 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "backpointers.h" ++#include "btree_cache.h" ++#include "btree_update.h" ++#include "error.h" ++ ++#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 ++ ++/* ++ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc ++ * btree: ++ */ ++static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, ++ struct bpos bp_pos) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); ++ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; ++ ++ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); ++} ++ ++/* ++ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: ++ */ ++static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, ++ struct bpos bucket, ++ u64 bucket_offset) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); ++ ++ return POS(bucket.inode, ++ (bucket_to_sector(ca, bucket.offset) << ++ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); ++} ++ ++void bch2_extent_ptr_to_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ struct bpos *bucket_pos, struct bch_backpointer *bp) ++{ ++ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; ++ s64 sectors = level ? 
btree_sectors(c) : k.k->size; ++ u32 bucket_offset; ++ ++ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); ++ *bp = (struct bch_backpointer) { ++ .btree_id = btree_id, ++ .level = level, ++ .data_type = data_type, ++ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + ++ p.crc.offset, ++ .bucket_len = ptr_disk_sectors(sectors, p), ++ .pos = k.k->p, ++ }; ++} ++ ++static bool extent_matches_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, ++ struct bpos bucket, ++ struct bch_backpointer bp) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bpos bucket2; ++ struct bch_backpointer bp2; ++ ++ if (p.ptr.cached) ++ continue; ++ ++ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, ++ &bucket2, &bp2); ++ if (!bpos_cmp(bucket, bucket2) && ++ !memcmp(&bp, &bp2, sizeof(bp))) ++ return true; ++ } ++ ++ return false; ++} ++ ++int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); ++ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); ++ ++ if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { ++ prt_str(err, "incorrect value size"); ++ return -EINVAL; ++ } ++ ++ if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { ++ prt_str(err, "backpointer at wrong pos"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) ++{ ++ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", ++ bch2_btree_ids[bp->btree_id], ++ bp->level, ++ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), ++ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), ++ bp->bucket_len); ++ bch2_bpos_to_text(out, bp->pos); ++} ++ ++void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); ++} ++ ++void bch2_backpointer_swab(struct bkey_s k) ++{ ++ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); ++ ++ bp.v->bucket_offset = swab32(bp.v->bucket_offset); ++ bp.v->bucket_len = swab32(bp.v->bucket_len); ++ bch2_bpos_swab(&bp.v->pos); ++} ++ ++#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) ++ ++static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) ++{ ++ return cmp_int(l.bucket_offset, r.bucket_offset); ++} ++ ++static int bch2_backpointer_del_by_offset(struct btree_trans *trans, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (bp_offset < BACKPOINTER_OFFSET_MAX) { ++ struct bch_backpointer *bps; ++ struct bkey_i_alloc_v4 *a; ++ unsigned i, nr; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ bucket, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_alloc_v4) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto err; ++ bps = alloc_v4_backpointers(&a->v); ++ nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); ++ ++ for (i = 0; i < nr; i++) { ++ if (bps[i].bucket_offset == bp_offset) ++ goto found; ++ if 
(bps[i].bucket_offset > bp_offset) ++ break; ++ } ++ ++ ret = -ENOENT; ++ goto err; ++found: ++ if (memcmp(&bps[i], &bp, sizeof(bp))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ array_remove_item(bps, nr, i); ++ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); ++ set_alloc_v4_u64s(a); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ } else { ++ bp_offset -= BACKPOINTER_OFFSET_MAX; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(c, bucket, bp_offset), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_backpointer || ++ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_bucket_backpointer_del(struct btree_trans *trans, ++ struct bkey_i_alloc_v4 *a, ++ struct bch_backpointer bp, ++ struct bkey_s_c orig_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); ++ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); ++ struct btree_iter bp_iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for (i = 0; i < nr; i++) { ++ int cmp = backpointer_cmp(bps[i], bp) ?: ++ memcmp(&bps[i], &bp, sizeof(bp)); ++ if (!cmp) ++ goto found; ++ if (cmp >= 0) ++ break; ++ } ++ ++ goto btree; ++found: ++ array_remove_item(bps, nr, i); ++ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); ++ set_alloc_v4_u64s(a); ++ return 0; ++btree: ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&bp_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_backpointer || ++ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "backpointer not found when deleting"); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ prt_printf(&buf, "searching for "); ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "got "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ prt_str(&buf, "alloc "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "for "); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ bch_err(c, "%s", buf.buf); ++ } else { ++ ret = -EIO; ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ } ++ printbuf_exit(&buf); ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &bp_iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ return ret; ++} ++ ++int bch2_bucket_backpointer_add(struct btree_trans *trans, ++ struct bkey_i_alloc_v4 *a, ++ struct bch_backpointer bp, ++ struct bkey_s_c orig_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca; ++ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); ++ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); ++ struct bkey_i_backpointer *bp_k; ++ struct btree_iter bp_iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ /* Check for duplicates: */ ++ for (i = 0; i < nr; i++) { ++ int cmp = backpointer_cmp(bps[i], bp); ++ if (cmp >= 0) ++ break; ++ } ++ ++ if ((i && ++ (bps[i - 1].bucket_offset + ++ bps[i - 1].bucket_len > bp.bucket_offset)) || ++ (i < nr 
&& ++ (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "overlapping backpointer found when inserting "); ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ prt_printf(&buf, "into "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "for "); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ bch_err(c, "%s", buf.buf); ++ else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ return -EIO; ++ } ++ } ++ ++ if (nr < BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) { ++ array_insert_item(bps, nr, i, bp); ++ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); ++ set_alloc_v4_u64s(a); ++ return 0; ++ } ++ ++ /* Overflow: use backpointer btree */ ++ bp_k = bch2_trans_kmalloc(trans, sizeof(*bp_k)); ++ ret = PTR_ERR_OR_ZERO(bp_k); ++ if (ret) ++ return ret; ++ ++ ca = bch_dev_bkey_exists(c, a->k.p.inode); ++ ++ bkey_backpointer_init(&bp_k->k_i); ++ bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); ++ bp_k->v = bp; ++ ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&bp_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "existing btree backpointer key found when inserting "); ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ prt_printf(&buf, "found "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "for "); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ bch_err(c, "%s", buf.buf); ++ else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ return ret; ++} ++ ++/* ++ * Find the next backpointer >= *bp_offset: ++ */ ++int bch2_get_next_backpointer(struct btree_trans *trans, ++ struct bpos bucket, int gen, ++ u64 *bp_offset, ++ struct bch_backpointer *dst) ++{ ++ struct bch_fs *c = trans->c; ++ struct bpos bp_pos = ++ bucket_pos_to_bp(c, bucket, ++ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); ++ struct bpos bp_end_pos = ++ bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); ++ struct btree_iter alloc_iter, bp_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_s_c_alloc_v4 a; ++ size_t i; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ bucket, BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ if (k.k->type != KEY_TYPE_alloc_v4) ++ goto done; ++ ++ a = bkey_s_c_to_alloc_v4(k); ++ if (gen >= 0 && a.v->gen != gen) ++ goto done; ++ ++ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { ++ if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) ++ continue; ++ ++ *dst = alloc_v4_backpointers_c(a.v)[i]; ++ *bp_offset = dst->bucket_offset; ++ goto out; ++ } ++ ++ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, ++ bp_pos, 0, k, ret) { ++ if (bpos_cmp(k.k->p, bp_end_pos) >= 0) ++ break; ++ ++ if (k.k->type != KEY_TYPE_backpointer) ++ continue; ++ ++ *dst = 
*bkey_s_c_to_backpointer(k).v; ++ *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; ++ goto out; ++ } ++done: ++ *bp_offset = U64_MAX; ++out: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ return ret; ++} ++ ++static void backpointer_not_found(struct btree_trans *trans, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp, ++ struct bkey_s_c k, ++ const char *thing_it_points_to) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", ++ thing_it_points_to); ++ prt_printf(&buf, "bucket: "); ++ bch2_bpos_to_text(&buf, bucket); ++ prt_printf(&buf, "\n "); ++ ++ if (bp_offset >= BACKPOINTER_OFFSET_MAX) { ++ struct bpos bp_pos = ++ bucket_pos_to_bp(c, bucket, ++ bp_offset - BACKPOINTER_OFFSET_MAX); ++ prt_printf(&buf, "backpointer pos: "); ++ bch2_bpos_to_text(&buf, bp_pos); ++ prt_printf(&buf, "\n "); ++ } ++ ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ bch_err(c, "%s", buf.buf); ++ else ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ++ printbuf_exit(&buf); ++} ++ ++struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ ++ bch2_trans_node_iter_init(trans, iter, ++ bp.btree_id, ++ bp.pos, ++ 0, ++ min(bp.level, c->btree_roots[bp.btree_id].level), ++ 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) { ++ bch2_trans_iter_exit(trans, iter); ++ return k; ++ } ++ ++ if (bp.level == c->btree_roots[bp.btree_id].level + 1) ++ k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); ++ ++ if (extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) ++ return k; ++ ++ backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); ++ ++ bch2_trans_iter_exit(trans, iter); ++ return bkey_s_c_null; ++} ++ ++struct btree *bch2_backpointer_get_node(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b; ++ struct bkey_s_c k; ++ ++ BUG_ON(!bp.level); ++ ++ bch2_trans_node_iter_init(trans, iter, ++ bp.btree_id, ++ bp.pos, ++ 0, ++ bp.level - 1, ++ 0); ++ b = bch2_btree_iter_peek_node(iter); ++ if (IS_ERR(b)) { ++ bch2_trans_iter_exit(trans, iter); ++ return b; ++ } ++ ++ if (extent_matches_bp(c, bp.btree_id, bp.level, ++ bkey_i_to_s_c(&b->key), ++ bucket, bp)) ++ return b; ++ ++ if (!btree_node_will_make_reachable(b)) ++ backpointer_not_found(trans, bucket, bp_offset, ++ bp, k, "btree node"); ++ ++ bch2_trans_iter_exit(trans, iter); ++ return NULL; ++} ++ ++static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter = { NULL }; ++ struct bch_dev *ca; ++ struct bkey_s_c alloc_k; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, ++ "backpointer for mising device:\n%s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, bp_iter, 0); ++ goto out; ++ } ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ bp_pos_to_bucket(c, k.k->p), 0); ++ ++ 
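++	/*
++	 * Editorial note, not part of this patch: k.k->p here is a position in
++	 * the backpointers btree, which (per bp_pos_to_bucket() and
++	 * bucket_pos_to_bp() above) encodes roughly
++	 *
++	 *	POS(dev, (bucket_start_sector << MAX_EXTENT_COMPRESS_RATIO_SHIFT)
++	 *		 + bucket_offset)
++	 *
++	 * so shifting the offset back down and converting sectors to a bucket
++	 * number yields the alloc-btree position being looked up here.
++	 */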
alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ goto out; ++ ++ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, ++ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", ++ alloc_iter.pos.inode, alloc_iter.pos.offset, ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, bp_iter, 0); ++ goto out; ++ } ++out: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* verify that every backpointer has a corresponding alloc key */ ++int bch2_check_btree_backpointers(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_backpointers, POS_MIN, 0, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ bch2_check_btree_backpointer(&trans, &iter, k))); ++} ++ ++static int check_bp_exists(struct btree_trans *trans, ++ struct bpos bucket_pos, ++ struct bch_backpointer bp, ++ struct bkey_s_c orig_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter, bp_iter = { NULL }; ++ struct printbuf buf = PRINTBUF; ++ struct bkey_s_c alloc_k, bp_k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); ++ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ goto err; ++ ++ if (alloc_k.k->type == KEY_TYPE_alloc_v4) { ++ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); ++ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); ++ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); ++ ++ for (i = 0; i < nr; i++) { ++ int cmp = backpointer_cmp(bps[i], bp) ?: ++ memcmp(&bps[i], &bp, sizeof(bp)); ++ if (!cmp) ++ goto out; ++ if (cmp >= 0) ++ break; ++ } ++ } else { ++ goto missing; ++ } ++ ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), ++ 0); ++ bp_k = bch2_btree_iter_peek_slot(&bp_iter); ++ ret = bkey_err(bp_k); ++ if (ret) ++ goto err; ++ ++ if (bp_k.k->type != KEY_TYPE_backpointer || ++ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) ++ goto missing; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++missing: ++ prt_printf(&buf, "missing backpointer for btree=%s l=%u ", ++ bch2_btree_ids[bp.btree_id], bp.level); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ prt_printf(&buf, "\nin alloc key "); ++ bch2_bkey_val_to_text(&buf, c, alloc_k); ++ ++ if (c->sb.version < bcachefs_metadata_version_backpointers || ++ c->opts.reconstruct_alloc || ++ fsck_err(c, "%s", buf.buf)) { ++ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); ++ ++ ret = PTR_ERR_OR_ZERO(a) ?: ++ bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?: ++ bch2_trans_update(trans, &alloc_iter, &a->k_i, 0); ++ } ++ ++ goto out; ++} ++ ++static int check_extent_to_backpointers(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bkey_s_c k; ++ int ret; ++ ++ k = bch2_btree_iter_peek_all_levels(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ if (!k.k) ++ return 0; ++ ++ ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bpos bucket_pos; ++ struct bch_backpointer bp; ++ ++ if (p.ptr.cached) ++ continue; ++ ++ 
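++		/*
++		 * Editorial note, not part of this patch: for each remaining
++		 * (non-cached) pointer we compute the backpointer this extent
++		 * should have, then check_bp_exists() verifies it is present
++		 * either inline in the bucket's alloc_v4 key or in the
++		 * backpointers btree, recreating it if fsck allows.
++		 */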
bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, ++ k, p, &bucket_pos, &bp); ++ ++ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int check_btree_root_to_backpointers(struct btree_trans *trans, ++ enum btree_id btree_id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct btree *b; ++ struct bkey_s_c k; ++ struct bkey_ptrs_c ptrs; ++ struct extent_ptr_decoded p; ++ const union bch_extent_entry *entry; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, ++ c->btree_roots[btree_id].level, 0); ++ b = bch2_btree_iter_peek_node(&iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto err; ++ ++ BUG_ON(b != btree_node_root(c, b)); ++ ++ k = bkey_i_to_s_c(&b->key); ++ ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bpos bucket_pos; ++ struct bch_backpointer bp; ++ ++ if (p.ptr.cached) ++ continue; ++ ++ bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, ++ k, p, &bucket_pos, &bp); ++ ++ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_check_extents_to_backpointers(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ enum btree_id btree_id; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, ++ 0, ++ BTREE_ITER_ALL_LEVELS| ++ BTREE_ITER_PREFETCH); ++ ++ do { ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_extent_to_backpointers(&trans, &iter)); ++ if (ret) ++ break; ++ } while (!bch2_btree_iter_advance(&iter)); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ break; ++ ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_btree_root_to_backpointers(&trans, btree_id)); ++ if (ret) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int check_one_backpointer(struct btree_trans *trans, ++ struct bpos bucket, ++ u64 *bp_offset) ++{ ++ struct btree_iter iter; ++ struct bch_backpointer bp; ++ struct bkey_s_c k; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ ret = bch2_get_next_backpointer(trans, bucket, -1, ++ bp_offset, &bp); ++ if (ret || *bp_offset == U64_MAX) ++ return ret; ++ ++ k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (fsck_err_on(!k.k, trans->c, ++ "%s backpointer points to missing extent\n%s", ++ *bp_offset < BACKPOINTER_OFFSET_MAX ? 
"alloc" : "btree", ++ (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { ++ ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "backpointer at %llu not found", *bp_offset); ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_backpointers_to_extents(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ u64 bp_offset = 0; ++ ++ while (!(ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_one_backpointer(&trans, iter.pos, &bp_offset))) && ++ bp_offset < U64_MAX) ++ bp_offset++; ++ ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} +diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h +new file mode 100644 +index 000000000000..fe42af296e9c +--- /dev/null ++++ b/fs/bcachefs/backpointers.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H ++#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H ++ ++#include "super.h" ++ ++int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, ++ int, struct printbuf *); ++void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); ++void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++void bch2_backpointer_swab(struct bkey_s); ++ ++#define bch2_bkey_ops_backpointer (struct bkey_ops) { \ ++ .key_invalid = bch2_backpointer_invalid, \ ++ .val_to_text = bch2_backpointer_k_to_text, \ ++ .swab = bch2_backpointer_swab, \ ++} ++ ++void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, ++ struct bkey_s_c, struct extent_ptr_decoded, ++ struct bpos *, struct bch_backpointer *); ++ ++int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, ++ struct bch_backpointer, struct bkey_s_c); ++int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, ++ struct bch_backpointer, struct bkey_s_c); ++int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, ++ u64 *, struct bch_backpointer *); ++struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64, struct bch_backpointer); ++struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64, struct bch_backpointer); ++ ++int bch2_check_btree_backpointers(struct bch_fs *); ++int bch2_check_extents_to_backpointers(struct bch_fs *); ++int bch2_check_backpointers_to_extents(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +new file mode 100644 +index 000000000000..8ffdb4dee47a +--- /dev/null ++++ b/fs/bcachefs/bcachefs.h +@@ -0,0 +1,1000 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_H ++#define _BCACHEFS_H ++ ++/* ++ * SOME HIGH LEVEL CODE DOCUMENTATION: ++ * ++ * Bcache mostly works with cache sets, cache devices, and backing devices. ++ * ++ * Support for multiple cache devices hasn't quite been finished off yet, but ++ * it's about 95% plumbed through. A cache set and its cache devices is sort of ++ * like a md raid array and its component devices. 
Most of the code doesn't care ++ * about individual cache devices, the main abstraction is the cache set. ++ * ++ * Multiple cache devices is intended to give us the ability to mirror dirty ++ * cached data and metadata, without mirroring clean cached data. ++ * ++ * Backing devices are different, in that they have a lifetime independent of a ++ * cache set. When you register a newly formatted backing device it'll come up ++ * in passthrough mode, and then you can attach and detach a backing device from ++ * a cache set at runtime - while it's mounted and in use. Detaching implicitly ++ * invalidates any cached data for that backing device. ++ * ++ * A cache set can have multiple (many) backing devices attached to it. ++ * ++ * There's also flash only volumes - this is the reason for the distinction ++ * between struct cached_dev and struct bcache_device. A flash only volume ++ * works much like a bcache device that has a backing device, except the ++ * "cached" data is always dirty. The end result is that we get thin ++ * provisioning with very little additional code. ++ * ++ * Flash only volumes work but they're not production ready because the moving ++ * garbage collector needs more work. More on that later. ++ * ++ * BUCKETS/ALLOCATION: ++ * ++ * Bcache is primarily designed for caching, which means that in normal ++ * operation all of our available space will be allocated. Thus, we need an ++ * efficient way of deleting things from the cache so we can write new things to ++ * it. ++ * ++ * To do this, we first divide the cache device up into buckets. A bucket is the ++ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ ++ * works efficiently. ++ * ++ * Each bucket has a 16 bit priority, and an 8 bit generation associated with ++ * it. The gens and priorities for all the buckets are stored contiguously and ++ * packed on disk (in a linked list of buckets - aside from the superblock, all ++ * of bcache's metadata is stored in buckets). ++ * ++ * The priority is used to implement an LRU. We reset a bucket's priority when ++ * we allocate it or on cache it, and every so often we decrement the priority ++ * of each bucket. It could be used to implement something more sophisticated, ++ * if anyone ever gets around to it. ++ * ++ * The generation is used for invalidating buckets. Each pointer also has an 8 ++ * bit generation embedded in it; for a pointer to be considered valid, its gen ++ * must match the gen of the bucket it points into. Thus, to reuse a bucket all ++ * we have to do is increment its gen (and write its new gen to disk; we batch ++ * this up). ++ * ++ * Bcache is entirely COW - we never write twice to a bucket, even buckets that ++ * contain metadata (including btree nodes). ++ * ++ * THE BTREE: ++ * ++ * Bcache is in large part design around the btree. ++ * ++ * At a high level, the btree is just an index of key -> ptr tuples. ++ * ++ * Keys represent extents, and thus have a size field. Keys also have a variable ++ * number of pointers attached to them (potentially zero, which is handy for ++ * invalidating the cache). ++ * ++ * The key itself is an inode:offset pair. The inode number corresponds to a ++ * backing device or a flash only volume. The offset is the ending offset of the ++ * extent within the inode - not the starting offset; this makes lookups ++ * slightly more convenient. ++ * ++ * Pointers contain the cache device id, the offset on that device, and an 8 bit ++ * generation number. More on the gen later. 
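++ *
++ * (Editorial aside, not part of this patch: the gen rule described above
++ * amounts to a check like the following, where the names are purely
++ * illustrative --
++ *
++ *	static bool example_ptr_valid(u8 ptr_gen, u8 bucket_gen)
++ *	{
++ *		// stale as soon as the bucket has been reused
++ *		return ptr_gen == bucket_gen;
++ *	}
++ *
++ * so invalidating every pointer into a bucket is just an increment of that
++ * bucket's gen, with the 8 bit value allowed to wrap.)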
++ * ++ * Index lookups are not fully abstracted - cache lookups in particular are ++ * still somewhat mixed in with the btree code, but things are headed in that ++ * direction. ++ * ++ * Updates are fairly well abstracted, though. There are two different ways of ++ * updating the btree; insert and replace. ++ * ++ * BTREE_INSERT will just take a list of keys and insert them into the btree - ++ * overwriting (possibly only partially) any extents they overlap with. This is ++ * used to update the index after a write. ++ * ++ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is ++ * overwriting a key that matches another given key. This is used for inserting ++ * data into the cache after a cache miss, and for background writeback, and for ++ * the moving garbage collector. ++ * ++ * There is no "delete" operation; deleting things from the index is ++ * accomplished by either by invalidating pointers (by incrementing a bucket's ++ * gen) or by inserting a key with 0 pointers - which will overwrite anything ++ * previously present at that location in the index. ++ * ++ * This means that there are always stale/invalid keys in the btree. They're ++ * filtered out by the code that iterates through a btree node, and removed when ++ * a btree node is rewritten. ++ * ++ * BTREE NODES: ++ * ++ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and ++ * free smaller than a bucket - so, that's how big our btree nodes are. ++ * ++ * (If buckets are really big we'll only use part of the bucket for a btree node ++ * - no less than 1/4th - but a bucket still contains no more than a single ++ * btree node. I'd actually like to change this, but for now we rely on the ++ * bucket's gen for deleting btree nodes when we rewrite/split a node.) ++ * ++ * Anyways, btree nodes are big - big enough to be inefficient with a textbook ++ * btree implementation. ++ * ++ * The way this is solved is that btree nodes are internally log structured; we ++ * can append new keys to an existing btree node without rewriting it. This ++ * means each set of keys we write is sorted, but the node is not. ++ * ++ * We maintain this log structure in memory - keeping 1Mb of keys sorted would ++ * be expensive, and we have to distinguish between the keys we have written and ++ * the keys we haven't. So to do a lookup in a btree node, we have to search ++ * each sorted set. But we do merge written sets together lazily, so the cost of ++ * these extra searches is quite low (normally most of the keys in a btree node ++ * will be in one big set, and then there'll be one or two sets that are much ++ * smaller). ++ * ++ * This log structure makes bcache's btree more of a hybrid between a ++ * conventional btree and a compacting data structure, with some of the ++ * advantages of both. ++ * ++ * GARBAGE COLLECTION: ++ * ++ * We can't just invalidate any bucket - it might contain dirty data or ++ * metadata. If it once contained dirty data, other writes might overwrite it ++ * later, leaving no valid pointers into that bucket in the index. ++ * ++ * Thus, the primary purpose of garbage collection is to find buckets to reuse. ++ * It also counts how much valid data it each bucket currently contains, so that ++ * allocation can reuse buckets sooner when they've been mostly overwritten. ++ * ++ * It also does some things that are really internal to the btree ++ * implementation. 
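A lookup in such a log-structured node is, at its simplest, one binary search per sorted set, with later sets taking precedence over earlier ones. A simplified userspace sketch of that idea (toy_set is hypothetical, and extent overlap and whiteout filtering are ignored):

#include <stddef.h>
#include <stdio.h>

/* Toy model: a "node" holds several independently sorted sets of (key, val)
 * pairs; later sets were appended later, so they win on duplicate keys. */
struct toy_set { const int *keys; const int *vals; size_t nr; };

static int bsearch_set(const struct toy_set *s, int key, int *val)
{
        size_t lo = 0, hi = s->nr;

        while (lo < hi) {
                size_t mid = lo + (hi - lo) / 2;

                if (s->keys[mid] < key)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        if (lo < s->nr && s->keys[lo] == key) {
                *val = s->vals[lo];
                return 1;
        }
        return 0;
}

/* Lookup = one binary search per sorted set, newest set first. */
static int node_lookup(const struct toy_set *sets, size_t nr_sets, int key, int *val)
{
        while (nr_sets--)
                if (bsearch_set(&sets[nr_sets], key, val))
                        return 1;
        return 0;
}

int main(void)
{
        const int k0[] = { 1, 5, 9 },  v0[] = { 10, 50, 90 };  /* big, old set */
        const int k1[] = { 5, 7 },     v1[] = { 55, 70 };      /* small, newer set */
        const struct toy_set sets[] = {
                { k0, v0, 3 },
                { k1, v1, 2 },
        };
        int val;

        if (node_lookup(sets, 2, 5, &val))
                printf("key 5 -> %d\n", val);   /* 55: newer set overrides */
        return 0;
}
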
If a btree node contains pointers that are stale by more than ++ * some threshold, it rewrites the btree node to avoid the bucket's generation ++ * wrapping around. It also merges adjacent btree nodes if they're empty enough. ++ * ++ * THE JOURNAL: ++ * ++ * Bcache's journal is not necessary for consistency; we always strictly ++ * order metadata writes so that the btree and everything else is consistent on ++ * disk in the event of an unclean shutdown, and in fact bcache had writeback ++ * caching (with recovery from unclean shutdown) before journalling was ++ * implemented. ++ * ++ * Rather, the journal is purely a performance optimization; we can't complete a ++ * write until we've updated the index on disk, otherwise the cache would be ++ * inconsistent in the event of an unclean shutdown. This means that without the ++ * journal, on random write workloads we constantly have to update all the leaf ++ * nodes in the btree, and those writes will be mostly empty (appending at most ++ * a few keys each) - highly inefficient in terms of amount of metadata writes, ++ * and it puts more strain on the various btree resorting/compacting code. ++ * ++ * The journal is just a log of keys we've inserted; on startup we just reinsert ++ * all the keys in the open journal entries. That means that when we're updating ++ * a node in the btree, we can wait until a 4k block of keys fills up before ++ * writing them out. ++ * ++ * For simplicity, we only journal updates to leaf nodes; updates to parent ++ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth ++ * the complexity to deal with journalling them (in particular, journal replay) ++ * - updates to non leaf nodes just happen synchronously (see btree_split()). ++ */ ++ ++#undef pr_fmt ++#ifdef __KERNEL__ ++#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++#else ++#define pr_fmt(fmt) "%s() " fmt "\n", __func__ ++#endif ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "errcode.h" ++#include "fifo.h" ++#include "opts.h" ++#include "util.h" ++ ++#define dynamic_fault(...) 0 ++#define race_fault(...) 0 ++ ++#define bch2_fs_init_fault(name) \ ++ dynamic_fault("bcachefs:bch_fs_init:" name) ++#define bch2_meta_read_fault(name) \ ++ dynamic_fault("bcachefs:meta:read:" name) ++#define bch2_meta_write_fault(name) \ ++ dynamic_fault("bcachefs:meta:write:" name) ++ ++#ifdef __KERNEL__ ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) ++#else ++#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) ++#endif ++ ++#define bch_info(c, fmt, ...) \ ++ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_notice(c, fmt, ...) \ ++ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn(c, fmt, ...) \ ++ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err(c, fmt, ...) \ ++ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ ++#define bch_err_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_inum_ratelimited(c, _inum, fmt, ...) 
\ ++ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) ++ ++#define bch_verbose(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define pr_verbose_init(opts, fmt, ...) \ ++do { \ ++ if (opt_get(opts, verbose)) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++/* Parameters that are useful for debugging, but should always be compiled in: */ ++#define BCH_DEBUG_PARAMS_ALWAYS() \ ++ BCH_DEBUG_PARAM(key_merging_disabled, \ ++ "Disables merging of extents") \ ++ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ ++ "Causes mark and sweep to compact and rewrite every " \ ++ "btree node it traverses") \ ++ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ ++ "Disables rewriting of btree nodes during mark and sweep")\ ++ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ ++ "Disables the shrinker callback for the btree node cache")\ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") \ ++ BCH_DEBUG_PARAM(verify_all_btree_replicas, \ ++ "When reading btree nodes, read all replicas and " \ ++ "compare them") ++ ++/* Parameters that should only be compiled in in debug mode: */ ++#define BCH_DEBUG_PARAMS_DEBUG() \ ++ BCH_DEBUG_PARAM(expensive_debug_checks, \ ++ "Enables various runtime debugging checks that " \ ++ "significantly affect performance") \ ++ BCH_DEBUG_PARAM(debug_check_iterators, \ ++ "Enables extra verification for btree iterators") \ ++ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ ++ "Verify btree accounting for keys within a node") \ ++ BCH_DEBUG_PARAM(journal_seq_verify, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(inject_invalid_keys, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(test_alloc_startup, \ ++ "Force allocator startup to use the slowpath where it" \ ++ "can't find enough free buckets without invalidating" \ ++ "cached data") \ ++ BCH_DEBUG_PARAM(force_reconstruct_read, \ ++ "Force reads to use the reconstruct path, when reading" \ ++ "from erasure coded extents") \ ++ BCH_DEBUG_PARAM(test_restart_gc, \ ++ "Test restarting mark and sweep gc when bucket gens change") ++ ++#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() ++#else ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() ++#endif ++ ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#ifndef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++#endif ++ ++#define BCH_LOCK_TIME_NR 128 ++ ++#define BCH_TIME_STATS() \ ++ x(btree_node_mem_alloc) \ ++ x(btree_node_split) \ ++ x(btree_node_compact) \ ++ x(btree_node_merge) \ ++ x(btree_node_sort) \ ++ x(btree_node_read) \ ++ x(btree_interior_update_foreground) \ ++ x(btree_interior_update_total) \ ++ x(btree_gc) \ ++ x(btree_lock_contended_read) \ ++ x(btree_lock_contended_intent) \ ++ x(btree_lock_contended_write) \ ++ x(data_write) \ ++ x(data_read) \ ++ x(data_promote) \ ++ x(journal_flush_write) \ ++ 
x(journal_noflush_write) \ ++ x(journal_flush_seq) \ ++ x(blocked_journal) \ ++ x(blocked_allocate) \ ++ x(blocked_allocate_open_bucket) ++ ++enum bch_time_stats { ++#define x(name) BCH_TIME_##name, ++ BCH_TIME_STATS() ++#undef x ++ BCH_TIME_STAT_NR ++}; ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "buckets_waiting_for_journal_types.h" ++#include "clock_types.h" ++#include "ec_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "replicas_types.h" ++#include "subvolume_types.h" ++#include "super_types.h" ++ ++/* Number of nodes btree coalesce will try to coalesce at once */ ++#define GC_MERGE_NODES 4U ++ ++/* Maximum number of nodes we might need to allocate atomically: */ ++#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) ++ ++/* Size of the freelist we allocate btree nodes from: */ ++#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) ++ ++#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) ++ ++struct btree; ++ ++enum gc_phase { ++ GC_PHASE_NOT_RUNNING, ++ GC_PHASE_START, ++ GC_PHASE_SB, ++ ++ GC_PHASE_BTREE_stripes, ++ GC_PHASE_BTREE_extents, ++ GC_PHASE_BTREE_inodes, ++ GC_PHASE_BTREE_dirents, ++ GC_PHASE_BTREE_xattrs, ++ GC_PHASE_BTREE_alloc, ++ GC_PHASE_BTREE_quotas, ++ GC_PHASE_BTREE_reflink, ++ GC_PHASE_BTREE_subvolumes, ++ GC_PHASE_BTREE_snapshots, ++ GC_PHASE_BTREE_lru, ++ GC_PHASE_BTREE_freespace, ++ GC_PHASE_BTREE_need_discard, ++ GC_PHASE_BTREE_backpointers, ++ ++ GC_PHASE_PENDING_DELETE, ++}; ++ ++struct gc_pos { ++ enum gc_phase phase; ++ struct bpos pos; ++ unsigned level; ++}; ++ ++struct reflink_gc { ++ u64 offset; ++ u32 size; ++ u32 refcount; ++}; ++ ++typedef GENRADIX(struct reflink_gc) reflink_gc_table; ++ ++struct io_count { ++ u64 sectors[2][BCH_DATA_NR]; ++}; ++ ++struct bch_dev { ++ struct kobject kobj; ++ struct percpu_ref ref; ++ struct completion ref_completion; ++ struct percpu_ref io_ref; ++ struct completion io_ref_completion; ++ ++ struct bch_fs *fs; ++ ++ u8 dev_idx; ++ /* ++ * Cached version of this device's member info from superblock ++ * Committed by bch2_write_super() -> bch_fs_mi_update() ++ */ ++ struct bch_member_cpu mi; ++ uuid_le uuid; ++ char name[BDEVNAME_SIZE]; ++ ++ struct bch_sb_handle disk_sb; ++ struct bch_sb *sb_read_scratch; ++ int sb_write_error; ++ dev_t dev; ++ ++ struct bch_devs_mask self; ++ ++ /* biosets used in cloned bios for writing multiple replicas */ ++ struct bio_set replica_set; ++ ++ /* ++ * Buckets: ++ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and ++ * gc_lock, for device resize - holding any is sufficient for access: ++ * Or rcu_read_lock(), but only for ptr_stale(): ++ */ ++ struct bucket_array __rcu *buckets_gc; ++ struct bucket_gens __rcu *bucket_gens; ++ u8 *oldest_gen; ++ unsigned long *buckets_nouse; ++ struct rw_semaphore bucket_lock; ++ ++ struct bch_dev_usage *usage_base; ++ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; ++ struct bch_dev_usage __percpu *usage_gc; ++ ++ /* Allocator: */ ++ u64 new_fs_bucket_idx; ++ u64 bucket_alloc_trans_early_cursor; ++ ++ unsigned nr_open_buckets; ++ unsigned nr_btree_reserve; ++ ++ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_partial_nr; ++ ++ size_t inc_gen_needs_gc; ++ size_t inc_gen_really_needs_gc; ++ size_t buckets_waiting_on_journal; ++ ++ atomic64_t rebalance_work; ++ ++ struct journal_device journal; ++ u64 
prev_journal_sector; ++ ++ struct work_struct io_error_work; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic64_t cur_latency[2]; ++ struct time_stats io_latency[2]; ++ ++#define CONGESTED_MAX 1024 ++ atomic_t congested; ++ u64 congested_last; ++ ++ struct io_count __percpu *io_done; ++}; ++ ++enum { ++ /* startup: */ ++ BCH_FS_STARTED, ++ BCH_FS_MAY_GO_RW, ++ BCH_FS_RW, ++ BCH_FS_WAS_RW, ++ ++ /* shutdown: */ ++ BCH_FS_STOPPING, ++ BCH_FS_EMERGENCY_RO, ++ BCH_FS_WRITE_DISABLE_COMPLETE, ++ BCH_FS_CLEAN_SHUTDOWN, ++ ++ /* fsck passes: */ ++ BCH_FS_TOPOLOGY_REPAIR_DONE, ++ BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ ++ BCH_FS_CHECK_LRUS_DONE, ++ BCH_FS_CHECK_BACKPOINTERS_DONE, ++ BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, ++ BCH_FS_FSCK_DONE, ++ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ ++ BCH_FS_NEED_ANOTHER_GC, ++ ++ BCH_FS_HAVE_DELETED_SNAPSHOTS, ++ ++ /* errors: */ ++ BCH_FS_ERROR, ++ BCH_FS_TOPOLOGY_ERROR, ++ BCH_FS_ERRORS_FIXED, ++ BCH_FS_ERRORS_NOT_FIXED, ++}; ++ ++struct btree_debug { ++ unsigned id; ++}; ++ ++struct lock_held_stats { ++ struct time_stats times[BCH_LOCK_TIME_NR]; ++ const char *names[BCH_LOCK_TIME_NR]; ++}; ++ ++struct bch_fs_pcpu { ++ u64 sectors_available; ++}; ++ ++struct journal_seq_blacklist_table { ++ size_t nr; ++ struct journal_seq_blacklist_table_entry { ++ u64 start; ++ u64 end; ++ bool dirty; ++ } entries[0]; ++}; ++ ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ bool allocated; ++ bool overwritten; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ /* ++ * Gap buffer: instead of all the empty space in the array being at the ++ * end of the buffer - from @nr to @size - the empty space is at @gap. ++ * This means that sequential insertions are O(n) instead of O(n^2). 
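The gap buffer described in this comment keeps the free space at the insertion point rather than at the end of the array, so a mostly-sequential stream of insertions costs O(n) in total rather than O(n^2). A small standalone sketch of the same layout over plain ints:

#include <stdio.h>
#include <string.h>

/* Toy gap buffer: live entries sit in d[0..gap) and d[gap + free..size),
 * where free = size - nr; inserting at the gap is O(1), and moving the gap
 * costs one memmove proportional to the distance moved. */
struct gap_buf {
        int    d[16];
        size_t gap;     /* index where the free space starts */
        size_t nr;      /* number of live entries */
        size_t size;    /* capacity */
};

static void move_gap(struct gap_buf *b, size_t new_gap)
{
        size_t back = b->size - b->nr;  /* width of the gap */

        if (new_gap < b->gap)
                memmove(b->d + new_gap + back, b->d + new_gap,
                        (b->gap - new_gap) * sizeof(b->d[0]));
        else if (new_gap > b->gap)
                memmove(b->d + b->gap, b->d + b->gap + back,
                        (new_gap - b->gap) * sizeof(b->d[0]));
        b->gap = new_gap;
}

static void insert_at(struct gap_buf *b, size_t idx, int v)
{
        move_gap(b, idx);               /* free for sequential inserts: gap is already there */
        b->d[b->gap++] = v;
        b->nr++;
}

int main(void)
{
        struct gap_buf b = { .size = 16 };
        size_t i;

        for (i = 0; i < 5; i++)
                insert_at(&b, i, (int)i * 10);  /* appends: gap never moves */
        insert_at(&b, 2, 99);                   /* one memmove of 3 entries */

        for (i = 0; i < b.nr; i++)
                printf("%d ", b.d[i < b.gap ? i : i + (b.size - b.nr)]);
        printf("\n");                           /* 0 10 99 20 30 40 */
        return 0;
}
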
++ */ ++ size_t gap; ++ size_t nr; ++ size_t size; ++ u64 journal_seq_base; ++}; ++ ++struct btree_path_buf { ++ struct btree_path *path; ++}; ++ ++#define REPLICAS_DELTA_LIST_MAX (1U << 16) ++ ++struct snapshot_t { ++ u32 parent; ++ u32 children[2]; ++ u32 subvol; /* Nonzero only if a subvolume points to this node: */ ++ u32 equiv; ++}; ++ ++typedef struct { ++ u32 subvol; ++ u64 inum; ++} subvol_inum; ++ ++#define BCACHEFS_ROOT_SUBVOL_INUM \ ++ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) ++ ++struct bch_fs { ++ struct closure cl; ++ ++ struct list_head list; ++ struct kobject kobj; ++ struct kobject counters_kobj; ++ struct kobject internal; ++ struct kobject opts_dir; ++ struct kobject time_stats; ++ unsigned long flags; ++ ++ int minor; ++ struct device *chardev; ++ struct super_block *vfs_sb; ++ dev_t dev; ++ char name[40]; ++ ++ /* ro/rw, add/remove/resize devices: */ ++ struct rw_semaphore state_lock; ++ ++ /* Counts outstanding writes, for clean transition to read-only */ ++ struct percpu_ref writes; ++ struct work_struct read_only_work; ++ ++ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ ++ struct bch_replicas_cpu replicas; ++ struct bch_replicas_cpu replicas_gc; ++ struct mutex replicas_gc_lock; ++ mempool_t replicas_delta_pool; ++ ++ struct journal_entry_res btree_root_journal_res; ++ struct journal_entry_res replicas_journal_res; ++ struct journal_entry_res clock_journal_res; ++ struct journal_entry_res dev_usage_journal_res; ++ ++ struct bch_disk_groups_cpu __rcu *disk_groups; ++ ++ struct bch_opts opts; ++ ++ /* Updated by bch2_sb_update():*/ ++ struct { ++ uuid_le uuid; ++ uuid_le user_uuid; ++ ++ u16 version; ++ u16 version_min; ++ ++ u8 nr_devices; ++ u8 clean; ++ ++ u8 encryption_type; ++ ++ u64 time_base_lo; ++ u32 time_base_hi; ++ unsigned time_units_per_sec; ++ unsigned nsec_per_time_unit; ++ u64 features; ++ u64 compat; ++ } sb; ++ ++ ++ struct bch_sb_handle disk_sb; ++ ++ unsigned short block_bits; /* ilog2(block_size) */ ++ ++ u16 btree_foreground_merge_threshold; ++ ++ struct closure sb_write; ++ struct mutex sb_lock; ++ ++ /* snapshot.c: */ ++ GENRADIX(struct snapshot_t) snapshots; ++ struct bch_snapshot_table __rcu *snapshot_table; ++ struct mutex snapshot_table_lock; ++ struct work_struct snapshot_delete_work; ++ struct work_struct snapshot_wait_for_pagecache_and_delete_work; ++ snapshot_id_list snapshots_unlinked; ++ struct mutex snapshots_unlinked_lock; ++ ++ /* BTREE CACHE */ ++ struct bio_set btree_bio; ++ struct workqueue_struct *io_complete_wq; ++ ++ struct btree_root btree_roots[BTREE_ID_NR]; ++ struct mutex btree_root_lock; ++ ++ struct btree_cache btree_cache; ++ ++ /* ++ * Cache of allocated btree nodes - if we allocate a btree node and ++ * don't use it, if we free it that space can't be reused until going ++ * _all_ the way through the allocator (which exposes us to a livelock ++ * when allocating btree reserves fail halfway through) - instead, we ++ * can stick them here: ++ */ ++ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; ++ unsigned btree_reserve_cache_nr; ++ struct mutex btree_reserve_cache_lock; ++ ++ mempool_t btree_interior_update_pool; ++ struct list_head btree_interior_update_list; ++ struct list_head btree_interior_updates_unwritten; ++ struct mutex btree_interior_update_lock; ++ struct closure_waitlist btree_interior_update_wait; ++ ++ struct workqueue_struct *btree_interior_update_worker; ++ struct work_struct btree_interior_update_work; ++ ++ /* btree_iter.c: */ ++ struct mutex btree_trans_lock; 
++ struct list_head btree_trans_list; ++ mempool_t btree_paths_pool; ++ mempool_t btree_trans_mem_pool; ++ struct btree_path_buf __percpu *btree_paths_bufs; ++ ++ struct srcu_struct btree_trans_barrier; ++ bool btree_trans_barrier_initialized; ++ ++ struct btree_key_cache btree_key_cache; ++ unsigned btree_key_cache_btrees; ++ ++ struct workqueue_struct *btree_update_wq; ++ struct workqueue_struct *btree_io_complete_wq; ++ /* copygc needs its own workqueue for index updates.. */ ++ struct workqueue_struct *copygc_wq; ++ ++ /* ALLOCATION */ ++ struct bch_devs_mask rw_devs[BCH_DATA_NR]; ++ ++ u64 capacity; /* sectors */ ++ ++ /* ++ * When capacity _decreases_ (due to a disk being removed), we ++ * increment capacity_gen - this invalidates outstanding reservations ++ * and forces them to be revalidated ++ */ ++ u32 capacity_gen; ++ unsigned bucket_size_max; ++ ++ atomic64_t sectors_available; ++ struct mutex sectors_available_lock; ++ ++ struct bch_fs_pcpu __percpu *pcpu; ++ ++ struct percpu_rw_semaphore mark_lock; ++ ++ seqcount_t usage_lock; ++ struct bch_fs_usage *usage_base; ++ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; ++ struct bch_fs_usage __percpu *usage_gc; ++ u64 __percpu *online_reserved; ++ ++ /* single element mempool: */ ++ struct mutex usage_scratch_lock; ++ struct bch_fs_usage_online *usage_scratch; ++ ++ struct io_clock io_clock[2]; ++ ++ /* JOURNAL SEQ BLACKLIST */ ++ struct journal_seq_blacklist_table * ++ journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; ++ ++ /* ALLOCATOR */ ++ spinlock_t freelist_lock; ++ struct closure_waitlist freelist_wait; ++ u64 blocked_allocate; ++ u64 blocked_allocate_open_bucket; ++ ++ open_bucket_idx_t open_buckets_freelist; ++ open_bucket_idx_t open_buckets_nr_free; ++ struct closure_waitlist open_buckets_wait; ++ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; ++ ++ struct write_point btree_write_point; ++ struct write_point rebalance_write_point; ++ ++ struct write_point write_points[WRITE_POINT_MAX]; ++ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; ++ struct mutex write_points_hash_lock; ++ unsigned write_points_nr; ++ ++ struct buckets_waiting_for_journal buckets_waiting_for_journal; ++ struct work_struct discard_work; ++ struct work_struct invalidate_work; ++ ++ /* GARBAGE COLLECTION */ ++ struct task_struct *gc_thread; ++ atomic_t kick_gc; ++ unsigned long gc_count; ++ ++ enum btree_id gc_gens_btree; ++ struct bpos gc_gens_pos; ++ ++ /* ++ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] ++ * has been marked by GC. ++ * ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) ++ * ++ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread ++ * can read without a lock. ++ */ ++ seqcount_t gc_pos_lock; ++ struct gc_pos gc_pos; ++ ++ /* ++ * The allocation code needs gc_mark in struct bucket to be correct, but ++ * it's not while a gc is in progress. 
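capacity_gen, noted a little above, turns "capacity shrank" into a cheap validity test: anything that recorded the old generation must revalidate and re-reserve. A userspace sketch of that pattern with hypothetical toy_fs/toy_reservation types (the real path also maintains the atomic and per-cpu sectors_available counters shown above, omitted here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_fs {
        uint64_t capacity;          /* sectors */
        uint64_t reserved;          /* sectors handed out in reservations */
        uint32_t capacity_gen;      /* bumped whenever capacity shrinks */
};

struct toy_reservation {
        uint64_t sectors;
        uint32_t gen;
};

static bool reserve(struct toy_fs *fs, struct toy_reservation *res, uint64_t sectors)
{
        if (fs->reserved + sectors > fs->capacity)
                return false;
        fs->reserved += sectors;
        *res = (struct toy_reservation) { .sectors = sectors, .gen = fs->capacity_gen };
        return true;
}

/* Shrinking capacity invalidates everything taken under the old gen. */
static void shrink_capacity(struct toy_fs *fs, uint64_t new_capacity)
{
        fs->capacity = new_capacity;
        fs->reserved = 0;           /* callers must re-reserve */
        fs->capacity_gen++;
}

static bool reservation_valid(const struct toy_fs *fs, const struct toy_reservation *res)
{
        return res->gen == fs->capacity_gen;
}

int main(void)
{
        struct toy_fs fs = { .capacity = 1 << 20 };
        struct toy_reservation res;

        if (!reserve(&fs, &res, 4096))
                return 1;
        printf("valid before shrink: %d\n", reservation_valid(&fs, &res)); /* 1 */

        shrink_capacity(&fs, 1 << 19);
        printf("valid after shrink:  %d\n", reservation_valid(&fs, &res)); /* 0 */
        return 0;
}
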
++ */ ++ struct rw_semaphore gc_lock; ++ struct mutex gc_gens_lock; ++ ++ /* IO PATH */ ++ struct semaphore io_in_flight; ++ struct bio_set bio_read; ++ struct bio_set bio_read_split; ++ struct bio_set bio_write; ++ struct mutex bio_bounce_pages_lock; ++ mempool_t bio_bounce_pages; ++ struct rhashtable promote_table; ++ ++ mempool_t compression_bounce[2]; ++ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; ++ mempool_t decompress_workspace; ++ ZSTD_parameters zstd_params; ++ ++ struct crypto_shash *sha256; ++ struct crypto_sync_skcipher *chacha20; ++ struct crypto_shash *poly1305; ++ ++ atomic64_t key_version; ++ ++ mempool_t large_bkey_pool; ++ ++ /* REBALANCE */ ++ struct bch_fs_rebalance rebalance; ++ ++ /* COPYGC */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct write_point copygc_write_point; ++ s64 copygc_wait; ++ bool copygc_running; ++ wait_queue_head_t copygc_running_wq; ++ ++ /* DATA PROGRESS STATS */ ++ struct list_head data_progress_list; ++ struct mutex data_progress_lock; ++ ++ /* STRIPES: */ ++ GENRADIX(struct stripe) stripes; ++ GENRADIX(struct gc_stripe) gc_stripes; ++ ++ ec_stripes_heap ec_stripes_heap; ++ spinlock_t ec_stripes_heap_lock; ++ ++ /* ERASURE CODING */ ++ struct list_head ec_stripe_head_list; ++ struct mutex ec_stripe_head_lock; ++ ++ struct list_head ec_stripe_new_list; ++ struct mutex ec_stripe_new_lock; ++ ++ struct work_struct ec_stripe_create_work; ++ u64 ec_stripe_hint; ++ ++ struct bio_set ec_bioset; ++ ++ struct work_struct ec_stripe_delete_work; ++ struct llist_head ec_stripe_delete_list; ++ ++ /* REFLINK */ ++ u64 reflink_hint; ++ reflink_gc_table reflink_gc_table; ++ size_t reflink_gc_nr; ++ ++ /* VFS IO PATH - fs-io.c */ ++ struct bio_set writepage_bioset; ++ struct bio_set dio_write_bioset; ++ struct bio_set dio_read_bioset; ++ ++ ++ atomic64_t btree_writes_nr; ++ atomic64_t btree_writes_sectors; ++ spinlock_t btree_write_error_lock; ++ ++ /* ERRORS */ ++ struct list_head fsck_errors; ++ struct mutex fsck_error_lock; ++ bool fsck_alloc_err; ++ ++ /* QUOTAS */ ++ struct bch_memquota_type quotas[QTYP_NR]; ++ ++ /* DEBUG JUNK */ ++ struct dentry *fs_debug_dir; ++ struct dentry *btree_debug_dir; ++ struct btree_debug btree_debug[BTREE_ID_NR]; ++ struct btree *verify_data; ++ struct btree_node *verify_ondisk; ++ struct mutex verify_lock; ++ ++ u64 *unused_inode_hints; ++ unsigned inode_shard_bits; ++ ++ /* ++ * A btree node on disk could have too many bsets for an iterator to fit ++ * on the stack - have to dynamically allocate them ++ */ ++ mempool_t fill_iter; ++ ++ mempool_t btree_bounce_pool; ++ ++ struct journal journal; ++ GENRADIX(struct journal_replay *) journal_entries; ++ u64 journal_entries_base_seq; ++ struct journal_keys journal_keys; ++ struct list_head journal_iters; ++ ++ u64 last_bucket_seq_cleanup; ++ ++ /* TODO rewrite as counters - The rest of this all shows up in sysfs */ ++ atomic_long_t read_realloc_races; ++ atomic_long_t extent_migrate_done; ++ atomic_long_t extent_migrate_raced; ++ atomic_long_t bucket_alloc_fail; ++ ++ u64 counters_on_mount[BCH_COUNTER_NR]; ++ u64 __percpu *counters; ++ ++ unsigned btree_gc_periodic:1; ++ unsigned copy_gc_enabled:1; ++ bool promote_whole_extents; ++ ++ struct time_stats times[BCH_TIME_STAT_NR]; ++ ++ struct lock_held_stats lock_held_stats; ++}; ++ ++static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) ++{ ++#ifndef NO_BCACHEFS_FS ++ if (c->vfs_sb) ++ c->vfs_sb->s_bdi->ra_pages = ra_pages; ++#endif ++} ++ ++static inline unsigned 
bucket_bytes(const struct bch_dev *ca) ++{ ++ return ca->mi.bucket_size << 9; ++} ++ ++static inline unsigned block_bytes(const struct bch_fs *c) ++{ ++ return c->opts.block_size; ++} ++ ++static inline unsigned block_sectors(const struct bch_fs *c) ++{ ++ return c->opts.block_size >> 9; ++} ++ ++static inline size_t btree_sectors(const struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> 9; ++} ++ ++static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) ++{ ++ return c->btree_key_cache_btrees & (1U << btree); ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) ++{ ++ struct timespec64 t; ++ s32 rem; ++ ++ time += c->sb.time_base_lo; ++ ++ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); ++ t.tv_nsec = rem * c->sb.nsec_per_time_unit; ++ return t; ++} ++ ++static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) ++{ ++ return (ts.tv_sec * c->sb.time_units_per_sec + ++ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; ++} ++ ++static inline s64 bch2_current_time(const struct bch_fs *c) ++{ ++ struct timespec64 now; ++ ++ ktime_get_coarse_real_ts64(&now); ++ return timespec_to_bch2_time(c, now); ++} ++ ++static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) ++{ ++ return dev < c->sb.nr_devices && c->devs[dev]; ++} ++ ++#endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +new file mode 100644 +index 000000000000..147fde1417b0 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_format.h +@@ -0,0 +1,2052 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FORMAT_H ++#define _BCACHEFS_FORMAT_H ++ ++/* ++ * bcachefs on disk data structures ++ * ++ * OVERVIEW: ++ * ++ * There are three main types of on disk data structures in bcachefs (this is ++ * reduced from 5 in bcache) ++ * ++ * - superblock ++ * - journal ++ * - btree ++ * ++ * The btree is the primary structure; most metadata exists as keys in the ++ * various btrees. There are only a small number of btrees, they're not ++ * sharded - we have one btree for extents, another for inodes, et cetera. ++ * ++ * SUPERBLOCK: ++ * ++ * The superblock contains the location of the journal, the list of devices in ++ * the filesystem, and in general any metadata we need in order to decide ++ * whether we can start a filesystem or prior to reading the journal/btree ++ * roots. ++ * ++ * The superblock is extensible, and most of the contents of the superblock are ++ * in variable length, type tagged fields; see struct bch_sb_field. ++ * ++ * Backup superblocks do not reside in a fixed location; also, superblocks do ++ * not have a fixed size. To locate backup superblocks we have struct ++ * bch_sb_layout; we store a copy of this inside every superblock, and also ++ * before the first superblock. ++ * ++ * JOURNAL: ++ * ++ * The journal primarily records btree updates in the order they occurred; ++ * journal replay consists of just iterating over all the keys in the open ++ * journal entries and re-inserting them into the btrees. ++ * ++ * The journal also contains entry types for the btree roots, and blacklisted ++ * journal sequence numbers (see journal_seq_blacklist.c). ++ * ++ * BTREE: ++ * ++ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically ++ * 128k-256k) and log structured. We use struct btree_node for writing the first ++ * entry in a given node (offset 0), and struct btree_node_entry for all ++ * subsequent writes. 
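The bch2_time_to_timespec()/timespec_to_bch2_time() helpers above store inode times as a count of superblock-defined time units past time_base_lo. A standalone restatement of that arithmetic, assuming a superblock configured for nanosecond units and ignoring time_base_hi and negative-value rounding:

#include <inttypes.h>
#include <stdio.h>

struct toy_sb {
        int64_t  time_base_lo;        /* filesystem epoch, in time units */
        uint32_t time_units_per_sec;
        uint32_t nsec_per_time_unit;  /* == NSEC_PER_SEC / time_units_per_sec */
};

struct toy_ts { int64_t sec; int64_t nsec; };

static struct toy_ts fs_time_to_ts(const struct toy_sb *sb, int64_t t)
{
        struct toy_ts ts;

        t += sb->time_base_lo;
        ts.sec  = t / sb->time_units_per_sec;
        ts.nsec = (t % sb->time_units_per_sec) * sb->nsec_per_time_unit;
        return ts;
}

static int64_t ts_to_fs_time(const struct toy_sb *sb, struct toy_ts ts)
{
        return ts.sec * sb->time_units_per_sec +
               ts.nsec / sb->nsec_per_time_unit -
               sb->time_base_lo;
}

int main(void)
{
        struct toy_sb sb = {
                .time_base_lo = 1000000000LL * 1577836800,  /* 2020-01-01 in ns */
                .time_units_per_sec = 1000000000,
                .nsec_per_time_unit = 1,
        };
        int64_t t = 42;                                     /* 42ns past the base */
        struct toy_ts ts = fs_time_to_ts(&sb, t);

        printf("%" PRId64 ".%09" PRId64 " -> %" PRId64 "\n",
               ts.sec, ts.nsec, ts_to_fs_time(&sb, ts));    /* round-trips to 42 */
        return 0;
}
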
++ * ++ * After the header, btree node entries contain a list of keys in sorted order. ++ * Values are stored inline with the keys; since values are variable length (and ++ * keys effectively are variable length too, due to packing) we can't do random ++ * access without building up additional in memory tables in the btree node read ++ * path. ++ * ++ * BTREE KEYS (struct bkey): ++ * ++ * The various btrees share a common format for the key - so as to avoid ++ * switching in fastpath lookup/comparison code - but define their own ++ * structures for the key values. ++ * ++ * The size of a key/value pair is stored as a u8 in units of u64s, so the max ++ * size is just under 2k. The common part also contains a type tag for the ++ * value, and a format field indicating whether the key is packed or not (and ++ * also meant to allow adding new key fields in the future, if desired). ++ * ++ * bkeys, when stored within a btree node, may also be packed. In that case, the ++ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can ++ * be generous with field sizes in the common part of the key format (64 bit ++ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. ++ */ ++ ++#include ++#include ++#include ++#include ++#include "vstructs.h" ++ ++#define BITMASK(name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (k->field >> offset) & ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ k->field &= ~(~(~0ULL << (end - offset)) << offset); \ ++ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ ++} ++ ++#define LE_BITMASK(_bits, name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (__le##_bits##_to_cpu(k->field) >> offset) & \ ++ ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ __u##_bits new = __le##_bits##_to_cpu(k->field); \ ++ \ ++ new &= ~(~(~0ULL << (end - offset)) << offset); \ ++ new |= (v & ~(~0ULL << (end - offset))) << offset; \ ++ k->field = __cpu_to_le##_bits(new); \ ++} ++ ++#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) ++#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) ++#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) ++ ++struct bkey_format { ++ __u8 key_u64s; ++ __u8 nr_fields; ++ /* One unused slot for now: */ ++ __u8 bits_per_field[6]; ++ __le64 field_offset[6]; ++}; ++ ++/* Btree keys - all units are in sectors */ ++ ++struct bpos { ++ /* ++ * Word order matches machine byte order - btree code treats a bpos as a ++ * single large integer, for search/comparison purposes ++ * ++ * Note that wherever a bpos is embedded in another on disk data ++ * structure, it has to be byte swabbed when reading in metadata that ++ * wasn't written in native endian order: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u32 snapshot; ++ __u64 offset; ++ __u64 inode; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u64 inode; ++ __u64 offset; /* Points to end of extent - sectors */ ++ __u32 snapshot; ++#else ++#error edit for your odd byteorder. 
++#endif ++} __attribute__((packed, aligned(4))); ++ ++#define KEY_INODE_MAX ((__u64)~0ULL) ++#define KEY_OFFSET_MAX ((__u64)~0ULL) ++#define KEY_SNAPSHOT_MAX ((__u32)~0U) ++#define KEY_SIZE_MAX ((__u32)~0U) ++ ++static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) ++{ ++ return (struct bpos) { ++ .inode = inode, ++ .offset = offset, ++ .snapshot = snapshot, ++ }; ++} ++ ++#define POS_MIN SPOS(0, 0, 0) ++#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) ++#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) ++#define POS(_inode, _offset) SPOS(_inode, _offset, 0) ++ ++/* Empty placeholder struct, for container_of() */ ++struct bch_val { ++ __u64 __nothing[0]; ++}; ++ ++struct bversion { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u64 lo; ++ __u32 hi; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u32 hi; ++ __u64 lo; ++#endif ++} __attribute__((packed, aligned(4))); ++ ++struct bkey { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u8 pad[1]; ++ ++ struct bversion version; ++ __u32 size; /* extent size, in sectors */ ++ struct bpos p; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ struct bpos p; ++ __u32 size; /* extent size, in sectors */ ++ struct bversion version; ++ ++ __u8 pad[1]; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bkey_packed { ++ __u64 _data[0]; ++ ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++ ++ /* ++ * XXX: next incompat on disk format change, switch format and ++ * needs_whiteout - bkey_packed() will be cheaper if format is the high ++ * bits of the bitfield ++ */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ __u8 key_start[0]; ++ ++ /* ++ * We copy bkeys with struct assignment in various places, and while ++ * that shouldn't be done with packed bkeys we can't disallow it in C, ++ * and it's legal to cast a bkey to a bkey_packed - so padding it out ++ * to the same size as struct bkey should hopefully be safest. 
++ */ ++ __u8 pad[sizeof(struct bkey) - 3]; ++} __attribute__((packed, aligned(8))); ++ ++#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) ++#define BKEY_U64s_MAX U8_MAX ++#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) ++ ++#define KEY_PACKED_BITS_START 24 ++ ++#define KEY_FORMAT_LOCAL_BTREE 0 ++#define KEY_FORMAT_CURRENT 1 ++ ++enum bch_bkey_fields { ++ BKEY_FIELD_INODE, ++ BKEY_FIELD_OFFSET, ++ BKEY_FIELD_SNAPSHOT, ++ BKEY_FIELD_SIZE, ++ BKEY_FIELD_VERSION_HI, ++ BKEY_FIELD_VERSION_LO, ++ BKEY_NR_FIELDS, ++}; ++ ++#define bkey_format_field(name, field) \ ++ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) ++ ++#define BKEY_FORMAT_CURRENT \ ++((struct bkey_format) { \ ++ .key_u64s = BKEY_U64s, \ ++ .nr_fields = BKEY_NR_FIELDS, \ ++ .bits_per_field = { \ ++ bkey_format_field(INODE, p.inode), \ ++ bkey_format_field(OFFSET, p.offset), \ ++ bkey_format_field(SNAPSHOT, p.snapshot), \ ++ bkey_format_field(SIZE, size), \ ++ bkey_format_field(VERSION_HI, version.hi), \ ++ bkey_format_field(VERSION_LO, version.lo), \ ++ }, \ ++}) ++ ++/* bkey with inline value */ ++struct bkey_i { ++ __u64 _data[0]; ++ ++ union { ++ struct { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ }; ++ struct { ++ struct bkey k; ++ struct bch_val v; ++ }; ++ }; ++}; ++ ++#define KEY(_inode, _offset, _size) \ ++((struct bkey) { \ ++ .u64s = BKEY_U64s, \ ++ .format = KEY_FORMAT_CURRENT, \ ++ .p = POS(_inode, _offset), \ ++ .size = _size, \ ++}) ++ ++static inline void bkey_init(struct bkey *k) ++{ ++ *k = KEY(0, 0, 0); ++} ++ ++#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) ++ ++#define __BKEY_PADDED(key, pad) \ ++ struct { struct bkey_i key; __u64 key ## _pad[pad]; } ++ ++/* ++ * - DELETED keys are used internally to mark keys that should be ignored but ++ * override keys in composition order. Their version number is ignored. ++ * ++ * - DISCARDED keys indicate that the data is all 0s because it has been ++ * discarded. DISCARDs may have a version; if the version is nonzero the key ++ * will be persistent, otherwise the key will be dropped whenever the btree ++ * node is rewritten (like DELETED keys). ++ * ++ * - ERROR: any read of the data returns a read error, as the data was lost due ++ * to a failing device. Like DISCARDED keys, they can be removed (overridden) ++ * by new writes or cluster-wide GC. Node repair can also overwrite them with ++ * the same or a more recent version number, but not with an older version ++ * number. 
++ * ++ * - WHITEOUT: for hash table btrees ++*/ ++#define BCH_BKEY_TYPES() \ ++ x(deleted, 0) \ ++ x(whiteout, 1) \ ++ x(error, 2) \ ++ x(cookie, 3) \ ++ x(hash_whiteout, 4) \ ++ x(btree_ptr, 5) \ ++ x(extent, 6) \ ++ x(reservation, 7) \ ++ x(inode, 8) \ ++ x(inode_generation, 9) \ ++ x(dirent, 10) \ ++ x(xattr, 11) \ ++ x(alloc, 12) \ ++ x(quota, 13) \ ++ x(stripe, 14) \ ++ x(reflink_p, 15) \ ++ x(reflink_v, 16) \ ++ x(inline_data, 17) \ ++ x(btree_ptr_v2, 18) \ ++ x(indirect_inline_data, 19) \ ++ x(alloc_v2, 20) \ ++ x(subvolume, 21) \ ++ x(snapshot, 22) \ ++ x(inode_v2, 23) \ ++ x(alloc_v3, 24) \ ++ x(set, 25) \ ++ x(lru, 26) \ ++ x(alloc_v4, 27) \ ++ x(backpointer, 28) ++ ++enum bch_bkey_type { ++#define x(name, nr) KEY_TYPE_##name = nr, ++ BCH_BKEY_TYPES() ++#undef x ++ KEY_TYPE_MAX, ++}; ++ ++struct bch_deleted { ++ struct bch_val v; ++}; ++ ++struct bch_whiteout { ++ struct bch_val v; ++}; ++ ++struct bch_error { ++ struct bch_val v; ++}; ++ ++struct bch_cookie { ++ struct bch_val v; ++ __le64 cookie; ++}; ++ ++struct bch_hash_whiteout { ++ struct bch_val v; ++}; ++ ++struct bch_set { ++ struct bch_val v; ++}; ++ ++/* Extents */ ++ ++/* ++ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally ++ * preceded by checksum/compression information (bch_extent_crc32 or ++ * bch_extent_crc64). ++ * ++ * One major determining factor in the format of extents is how we handle and ++ * represent extents that have been partially overwritten and thus trimmed: ++ * ++ * If an extent is not checksummed or compressed, when the extent is trimmed we ++ * don't have to remember the extent we originally allocated and wrote: we can ++ * merely adjust ptr->offset to point to the start of the data that is currently ++ * live. The size field in struct bkey records the current (live) size of the ++ * extent, and is also used to mean "size of region on disk that we point to" in ++ * this case. ++ * ++ * Thus an extent that is not checksummed or compressed will consist only of a ++ * list of bch_extent_ptrs, with none of the fields in ++ * bch_extent_crc32/bch_extent_crc64. ++ * ++ * When an extent is checksummed or compressed, it's not possible to read only ++ * the data that is currently live: we have to read the entire extent that was ++ * originally written, and then return only the part of the extent that is ++ * currently live. ++ * ++ * Thus, in addition to the current size of the extent in struct bkey, we need ++ * to store the size of the originally allocated space - this is the ++ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, ++ * when the extent is trimmed, instead of modifying the offset field of the ++ * pointer, we keep a second smaller offset field - "offset into the original ++ * extent of the currently live region". ++ * ++ * The other major determining factor is replication and data migration: ++ * ++ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated ++ * write, we will initially write all the replicas in the same format, with the ++ * same checksum type and compression format - however, when copygc runs later (or ++ * tiering/cache promotion, anything that moves data), it is not in general ++ * going to rewrite all the pointers at once - one of the replicas may be in a ++ * bucket on one device that has very little fragmentation while another lives ++ * in a bucket that has become heavily fragmented, and thus is being rewritten ++ * sooner than the rest. 
++ * ++ * Thus it will only move a subset of the pointers (or in the case of ++ * tiering/cache promotion perhaps add a single pointer without dropping any ++ * current pointers), and if the extent has been partially overwritten it must ++ * write only the currently live portion (or copygc would not be able to reduce ++ * fragmentation!) - which necessitates a different bch_extent_crc format for ++ * the new pointer. ++ * ++ * But in the interests of space efficiency, we don't want to store one ++ * bch_extent_crc for each pointer if we don't have to. ++ * ++ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and ++ * bch_extent_ptrs appended arbitrarily one after the other. We determine the ++ * type of a given entry with a scheme similar to utf8 (except we're encoding a ++ * type, not a size), encoding the type in the position of the first set bit: ++ * ++ * bch_extent_crc32 - 0b1 ++ * bch_extent_ptr - 0b10 ++ * bch_extent_crc64 - 0b100 ++ * ++ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and ++ * bch_extent_crc64 is the least constrained). ++ * ++ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, ++ * until the next bch_extent_crc32/64. ++ * ++ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer ++ * is neither checksummed nor compressed. ++ */ ++ ++/* 128 bits, sufficient for cryptographic MACs: */ ++struct bch_csum { ++ __le64 lo; ++ __le64 hi; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_EXTENT_ENTRY_TYPES() \ ++ x(ptr, 0) \ ++ x(crc32, 1) \ ++ x(crc64, 2) \ ++ x(crc128, 3) \ ++ x(stripe_ptr, 4) ++#define BCH_EXTENT_ENTRY_MAX 5 ++ ++enum bch_extent_entry_type { ++#define x(f, n) BCH_EXTENT_ENTRY_##f = n, ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++/* Compressed/uncompressed size are stored biased by 1: */ ++struct bch_extent_crc32 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u32 type:2, ++ _compressed_size:7, ++ _uncompressed_size:7, ++ offset:7, ++ _unused:1, ++ csum_type:4, ++ compression_type:4; ++ __u32 csum; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u32 csum; ++ __u32 compression_type:4, ++ csum_type:4, ++ _unused:1, ++ offset:7, ++ _uncompressed_size:7, ++ _compressed_size:7, ++ type:2; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++#define CRC32_SIZE_MAX (1U << 7) ++#define CRC32_NONCE_MAX 0 ++ ++struct bch_extent_crc64 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:3, ++ _compressed_size:9, ++ _uncompressed_size:9, ++ offset:9, ++ nonce:10, ++ csum_type:4, ++ compression_type:4, ++ csum_hi:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 csum_hi:16, ++ compression_type:4, ++ csum_type:4, ++ nonce:10, ++ offset:9, ++ _uncompressed_size:9, ++ _compressed_size:9, ++ type:3; ++#endif ++ __u64 csum_lo; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC64_SIZE_MAX (1U << 9) ++#define CRC64_NONCE_MAX ((1U << 10) - 1) ++ ++struct bch_extent_crc128 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:4, ++ _compressed_size:13, ++ _uncompressed_size:13, ++ offset:13, ++ nonce:13, ++ csum_type:4, ++ compression_type:4; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 compression_type:4, ++ csum_type:4, ++ nonce:13, ++ offset:13, ++ _uncompressed_size:13, ++ _compressed_size:13, ++ type:4; ++#endif ++ struct bch_csum csum; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC128_SIZE_MAX (1U << 13) ++#define CRC128_NONCE_MAX ((1U << 13) - 1) ++ ++/* ++ * @reservation - pointer hasn't been written to, just reserved ++ */ 
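With the encoding described above, decoding an extent entry's type is just "find the lowest set bit of the entry's first word", since an entry of type n stores 1 << n in its type bitfield and all of its other fields sit above that bit. A toy standalone decoder following the BCH_EXTENT_ENTRY_TYPES() numbering from the enum above (the find-first-set builtin used here is a GCC/Clang extension):

#include <stdio.h>

enum toy_entry_type {
        TOY_ENTRY_ptr        = 0,
        TOY_ENTRY_crc32      = 1,
        TOY_ENTRY_crc64      = 2,
        TOY_ENTRY_crc128     = 3,
        TOY_ENTRY_stripe_ptr = 4,
};

static unsigned long make_entry(enum toy_entry_type t)
{
        return 1UL << t;        /* the entry's other fields would live above bit t */
}

static enum toy_entry_type entry_type(unsigned long first_word)
{
        /* the lowest set bit is always the type bit; undefined for 0 */
        return (enum toy_entry_type) __builtin_ctzl(first_word);
}

int main(void)
{
        printf("%d %d %d\n",
               entry_type(make_entry(TOY_ENTRY_ptr)),
               entry_type(make_entry(TOY_ENTRY_crc64)),
               entry_type(make_entry(TOY_ENTRY_stripe_ptr)));  /* 0 2 4 */
        return 0;
}
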
++struct bch_extent_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:1, ++ cached:1, ++ unused:1, ++ reservation:1, ++ offset:44, /* 8 petabytes */ ++ dev:8, ++ gen:8; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 gen:8, ++ dev:8, ++ offset:44, ++ reservation:1, ++ unused:1, ++ cached:1, ++ type:1; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent_stripe_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:5, ++ block:8, ++ redundancy:4, ++ idx:47; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 idx:47, ++ redundancy:4, ++ block:8, ++ type:5; ++#endif ++}; ++ ++struct bch_extent_reservation { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:22, ++ replicas:4, ++ generation:32; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 generation:32, ++ replicas:4, ++ unused:22, ++ type:6; ++#endif ++}; ++ ++union bch_extent_entry { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 ++ unsigned long type; ++#elif __BITS_PER_LONG == 32 ++ struct { ++ unsigned long pad; ++ unsigned long type; ++ }; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++#define x(f, n) struct bch_extent_##f f; ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++struct bch_btree_ptr { ++ struct bch_val v; ++ ++ __u64 _data[0]; ++ struct bch_extent_ptr start[]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_btree_ptr_v2 { ++ struct bch_val v; ++ ++ __u64 mem_ptr; ++ __le64 seq; ++ __le16 sectors_written; ++ __le16 flags; ++ struct bpos min_key; ++ __u64 _data[0]; ++ struct bch_extent_ptr start[]; ++} __attribute__((packed, aligned(8))); ++ ++LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); ++ ++struct bch_extent { ++ struct bch_val v; ++ ++ __u64 _data[0]; ++ union bch_extent_entry start[]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reservation { ++ struct bch_val v; ++ ++ __le32 generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++} __attribute__((packed, aligned(8))); ++ ++/* Maximum size (in u64s) a single pointer could be: */ ++#define BKEY_EXTENT_PTR_U64s_MAX\ ++ ((sizeof(struct bch_extent_crc128) + \ ++ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ ++/* Maximum possible size of an entire extent value: */ ++#define BKEY_EXTENT_VAL_U64s_MAX \ ++ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) ++ ++/* * Maximum possible size of an entire extent, key + value: */ ++#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* Btree pointers don't carry around checksums: */ ++#define BKEY_BTREE_PTR_VAL_U64s_MAX \ ++ ((sizeof(struct bch_btree_ptr_v2) + \ ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) ++#define BKEY_BTREE_PTR_U64s_MAX \ ++ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) ++ ++/* Inodes */ ++ ++#define BLOCKDEV_INODE_MAX 4096 ++ ++#define BCACHEFS_ROOT_INO 4096 ++ ++struct bch_inode { ++ struct bch_val v; ++ ++ __le64 bi_hash_seed; ++ __le32 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_v2 { ++ struct bch_val v; ++ ++ __le64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ __le64 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_generation { ++ struct bch_val v; ++ ++ __le32 bi_generation; ++ __le32 pad; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * bi_subvol and bi_parent_subvol are only set for subvolume roots: ++ */ ++ ++#define BCH_INODE_FIELDS() \ ++ x(bi_atime, 96) \ ++ x(bi_ctime, 96) \ ++ x(bi_mtime, 
96) \ ++ x(bi_otime, 96) \ ++ x(bi_size, 64) \ ++ x(bi_sectors, 64) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) \ ++ x(bi_dir, 64) \ ++ x(bi_dir_offset, 64) \ ++ x(bi_subvol, 32) \ ++ x(bi_parent_subvol, 32) ++ ++/* subset of BCH_INODE_FIELDS */ ++#define BCH_INODE_OPTS() \ ++ x(data_checksum, 8) \ ++ x(compression, 8) \ ++ x(project, 32) \ ++ x(background_compression, 8) \ ++ x(data_replicas, 8) \ ++ x(promote_target, 16) \ ++ x(foreground_target, 16) \ ++ x(background_target, 16) \ ++ x(erasure_code, 16) ++ ++enum inode_opt_id { ++#define x(name, ...) \ ++ Inode_opt_##name, ++ BCH_INODE_OPTS() ++#undef x ++ Inode_opt_nr, ++}; ++ ++enum { ++ /* ++ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL ++ * flags) ++ */ ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, ++ ++ __BCH_INODE_I_SIZE_DIRTY= 5, ++ __BCH_INODE_I_SECTORS_DIRTY= 6, ++ __BCH_INODE_UNLINKED = 7, ++ __BCH_INODE_BACKPTR_UNTRUSTED = 8, ++ ++ /* bits 20+ reserved for packed fields below: */ ++}; ++ ++#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) ++#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) ++#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) ++#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) ++#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) ++#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) ++#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) ++#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) ++ ++LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); ++LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); ++ ++LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); ++LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); ++ ++/* Dirents */ ++ ++/* ++ * Dirents (and xattrs) have to implement string lookups; since our b-tree ++ * doesn't support arbitrary length strings for the key, we instead index by a ++ * 64 bit hash (currently truncated sha1) of the string, stored in the offset ++ * field of the key - using linear probing to resolve hash collisions. This also ++ * provides us with the readdir cookie posix requires. 
++ * ++ * Linear probing requires us to use whiteouts for deletions, in the event of a ++ * collision: ++ */ ++ ++struct bch_dirent { ++ struct bch_val v; ++ ++ /* Target inode number: */ ++ union { ++ __le64 d_inum; ++ struct { /* DT_SUBVOL */ ++ __le32 d_child_subvol; ++ __le32 d_parent_subvol; ++ }; ++ }; ++ ++ /* ++ * Copy of mode bits 12-15 from the target inode - so userspace can get ++ * the filetype without having to do a stat() ++ */ ++ __u8 d_type; ++ ++ __u8 d_name[]; ++} __attribute__((packed, aligned(8))); ++ ++#define DT_SUBVOL 16 ++#define BCH_DT_MAX 17 ++ ++#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ ++ sizeof(struct bkey) - \ ++ offsetof(struct bch_dirent, d_name))) ++ ++/* Xattrs */ ++ ++#define KEY_TYPE_XATTR_INDEX_USER 0 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 ++#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 ++#define KEY_TYPE_XATTR_INDEX_SECURITY 4 ++ ++struct bch_xattr { ++ struct bch_val v; ++ __u8 x_type; ++ __u8 x_name_len; ++ __le16 x_val_len; ++ __u8 x_name[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Bucket/allocation information: */ ++ ++struct bch_alloc { ++ struct bch_val v; ++ __u8 fields; ++ __u8 gen; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS_V1() \ ++ x(read_time, 16) \ ++ x(write_time, 16) \ ++ x(data_type, 8) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(oldest_gen, 8) \ ++ x(stripe, 32) \ ++ x(stripe_redundancy, 8) ++ ++enum { ++#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++}; ++ ++struct bch_alloc_v2 { ++ struct bch_val v; ++ __u8 nr_fields; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS_V2() \ ++ x(read_time, 64) \ ++ x(write_time, 64) \ ++ x(dirty_sectors, 32) \ ++ x(cached_sectors, 32) \ ++ x(stripe, 32) \ ++ x(stripe_redundancy, 8) ++ ++struct bch_alloc_v3 { ++ struct bch_val v; ++ __le64 journal_seq; ++ __le32 flags; ++ __u8 nr_fields; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) ++LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++ ++struct bch_alloc_v4 { ++ struct bch_val v; ++ __u64 journal_seq; ++ __u32 flags; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 stripe_redundancy; ++ __u32 dirty_sectors; ++ __u32 cached_sectors; ++ __u64 io_time[2]; ++ __u32 stripe; ++ __u32 nr_external_backpointers; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_V4_U64s_V0 6 ++#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) ++ ++BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) ++BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) ++BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) ++BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) ++ ++#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 ++ ++struct bch_backpointer { ++ struct bch_val v; ++ __u8 btree_id; ++ __u8 level; ++ __u8 data_type; ++ __u64 bucket_offset:40; ++ __u32 bucket_len; ++ struct bpos pos; ++} __attribute__((packed, aligned(8))); ++ ++/* Quotas: */ ++ ++enum quota_types { ++ QTYP_USR = 0, ++ QTYP_GRP = 1, ++ QTYP_PRJ = 2, ++ QTYP_NR = 3, ++}; ++ ++enum quota_counters { ++ Q_SPC = 0, ++ Q_INO = 1, ++ Q_COUNTERS = 2, ++}; ++ ++struct 
bch_quota_counter { ++ __le64 hardlimit; ++ __le64 softlimit; ++}; ++ ++struct bch_quota { ++ struct bch_val v; ++ struct bch_quota_counter c[Q_COUNTERS]; ++} __attribute__((packed, aligned(8))); ++ ++/* Erasure coding */ ++ ++struct bch_stripe { ++ struct bch_val v; ++ __le16 sectors; ++ __u8 algorithm; ++ __u8 nr_blocks; ++ __u8 nr_redundant; ++ ++ __u8 csum_granularity_bits; ++ __u8 csum_type; ++ __u8 pad; ++ ++ struct bch_extent_ptr ptrs[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Reflink: */ ++ ++struct bch_reflink_p { ++ struct bch_val v; ++ __le64 idx; ++ /* ++ * A reflink pointer might point to an indirect extent which is then ++ * later split (by copygc or rebalance). If we only pointed to part of ++ * the original indirect extent, and then one of the fragments is ++ * outside the range we point to, we'd leak a refcount: so when creating ++ * reflink pointers, we need to store pad values to remember the full ++ * range we were taking a reference on. ++ */ ++ __le32 front_pad; ++ __le32 back_pad; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reflink_v { ++ struct bch_val v; ++ __le64 refcount; ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_indirect_inline_data { ++ struct bch_val v; ++ __le64 refcount; ++ u8 data[0]; ++}; ++ ++/* Inline data */ ++ ++struct bch_inline_data { ++ struct bch_val v; ++ u8 data[0]; ++}; ++ ++/* Subvolumes: */ ++ ++#define SUBVOL_POS_MIN POS(0, 1) ++#define SUBVOL_POS_MAX POS(0, S32_MAX) ++#define BCACHEFS_ROOT_SUBVOL 1 ++ ++struct bch_subvolume { ++ struct bch_val v; ++ __le32 flags; ++ __le32 snapshot; ++ __le64 inode; ++}; ++ ++LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) ++/* ++ * We need to know whether a subvolume is a snapshot so we can know whether we ++ * can delete it (or whether it should just be rm -rf'd) ++ */ ++LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) ++LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) ++ ++/* Snapshots */ ++ ++struct bch_snapshot { ++ struct bch_val v; ++ __le32 flags; ++ __le32 parent; ++ __le32 children[2]; ++ __le32 subvol; ++ __le32 pad; ++}; ++ ++LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) ++ ++/* True if a subvolume points to this snapshot node: */ ++LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) ++ ++/* LRU btree: */ ++ ++struct bch_lru { ++ struct bch_val v; ++ __le64 idx; ++} __attribute__((packed, aligned(8))); ++ ++#define LRU_ID_STRIPES (1U << 16) ++ ++/* Optional/variable size superblock sections: */ ++ ++struct bch_sb_field { ++ __u64 _data[0]; ++ __le32 u64s; ++ __le32 type; ++}; ++ ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) \ ++ x(journal_v2, 9) \ ++ x(counters, 10) ++ ++enum bch_sb_field_type { ++#define x(f, nr) BCH_SB_FIELD_##f = nr, ++ BCH_SB_FIELDS() ++#undef x ++ BCH_SB_FIELD_NR ++}; ++ ++/* ++ * Most superblock fields are replicated in all device's superblocks - a few are ++ * not: ++ */ ++#define BCH_SINGLE_DEVICE_SB_FIELDS \ ++ ((1U << BCH_SB_FIELD_journal)| \ ++ (1U << BCH_SB_FIELD_journal_v2)) ++ ++/* BCH_SB_FIELD_journal: */ ++ ++struct bch_sb_field_journal { ++ struct bch_sb_field field; ++ __le64 buckets[0]; ++}; ++ ++struct bch_sb_field_journal_v2 { ++ struct bch_sb_field field; ++ ++ struct bch_sb_field_journal_v2_entry { ++ 
__le64 start; ++ __le64 nr; ++ } d[0]; ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 6) ++ ++struct bch_member { ++ uuid_le uuid; ++ __le64 nbuckets; /* device size */ ++ __le16 first_bucket; /* index of first bucket used */ ++ __le16 bucket_size; /* sectors */ ++ __le32 pad; ++ __le64 last_mount; /* time_t */ ++ ++ __le64 flags[2]; ++}; ++ ++LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) ++/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ ++LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) ++LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) ++LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) ++LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, ++ struct bch_member, flags[0], 30, 31) ++ ++#if 0 ++LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); ++LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); ++#endif ++ ++#define BCH_MEMBER_STATES() \ ++ x(rw, 0) \ ++ x(ro, 1) \ ++ x(failed, 2) \ ++ x(spare, 3) ++ ++enum bch_member_state { ++#define x(t, n) BCH_MEMBER_STATE_##t = n, ++ BCH_MEMBER_STATES() ++#undef x ++ BCH_MEMBER_STATE_NR ++}; ++ ++struct bch_sb_field_members { ++ struct bch_sb_field field; ++ struct bch_member members[0]; ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++struct nonce { ++ __le32 d[4]; ++}; ++ ++struct bch_key { ++ __le64 key[4]; ++}; ++ ++#define BCH_KEY_MAGIC \ ++ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ ++ ((u64) 'h' << 16)|((u64) '*' << 24)| \ ++ ((u64) '*' << 32)|((u64) 'k' << 40)| \ ++ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ ++struct bch_encrypted_key { ++ __le64 magic; ++ struct bch_key key; ++}; ++ ++/* ++ * If this field is present in the superblock, it stores an encryption key which ++ * is used encrypt all other data/metadata. The key will normally be encrypted ++ * with the key userspace provides, but if encryption has been turned off we'll ++ * just store the master key unencrypted in the superblock so we can access the ++ * previously encrypted data. 
++ */ ++struct bch_sb_field_crypt { ++ struct bch_sb_field field; ++ ++ __le64 flags; ++ __le64 kdf_flags; ++ struct bch_encrypted_key key; ++}; ++ ++LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); ++ ++enum bch_kdf_types { ++ BCH_KDF_SCRYPT = 0, ++ BCH_KDF_NR = 1, ++}; ++ ++/* stored as base 2 log of scrypt params: */ ++LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); ++LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); ++LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); ++ ++/* BCH_SB_FIELD_replicas: */ ++ ++#define BCH_DATA_TYPES() \ ++ x(free, 0) \ ++ x(sb, 1) \ ++ x(journal, 2) \ ++ x(btree, 3) \ ++ x(user, 4) \ ++ x(cached, 5) \ ++ x(parity, 6) \ ++ x(stripe, 7) \ ++ x(need_gc_gens, 8) \ ++ x(need_discard, 9) ++ ++enum bch_data_type { ++#define x(t, n) BCH_DATA_##t, ++ BCH_DATA_TYPES() ++#undef x ++ BCH_DATA_NR ++}; ++ ++static inline bool data_type_is_empty(enum bch_data_type type) ++{ ++ switch (type) { ++ case BCH_DATA_free: ++ case BCH_DATA_need_gc_gens: ++ case BCH_DATA_need_discard: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool data_type_is_hidden(enum bch_data_type type) ++{ ++ switch (type) { ++ case BCH_DATA_sb: ++ case BCH_DATA_journal: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_replicas_entry_v0 { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 devs[]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas_v0 { ++ struct bch_sb_field field; ++ struct bch_replicas_entry_v0 entries[]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_entry { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 nr_required; ++ __u8 devs[]; ++} __attribute__((packed)); ++ ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ ++struct bch_sb_field_replicas { ++ struct bch_sb_field field; ++ struct bch_replicas_entry entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_quota: */ ++ ++struct bch_sb_quota_counter { ++ __le32 timelimit; ++ __le32 warnlimit; ++}; ++ ++struct bch_sb_quota_type { ++ __le64 flags; ++ struct bch_sb_quota_counter c[Q_COUNTERS]; ++}; ++ ++struct bch_sb_field_quota { ++ struct bch_sb_field field; ++ struct bch_sb_quota_type q[QTYP_NR]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_disk_groups: */ ++ ++#define BCH_SB_LABEL_SIZE 32 ++ ++struct bch_disk_group { ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 flags[2]; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) ++LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) ++LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) ++ ++struct bch_sb_field_disk_groups { ++ struct bch_sb_field field; ++ struct bch_disk_group entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_counters */ ++ ++#define BCH_PERSISTENT_COUNTERS() \ ++ x(io_read, 0) \ ++ x(io_write, 1) \ ++ x(io_move, 2) \ ++ x(bucket_invalidate, 3) \ ++ x(bucket_discard, 4) ++ ++enum bch_persistent_counters { ++#define x(t, n, ...) 
BCH_COUNTER_##t, ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ BCH_COUNTER_NR ++}; ++ ++struct bch_sb_field_counters { ++ struct bch_sb_field field; ++ __le64 d[0]; ++}; ++ ++/* ++ * On clean shutdown, store btree roots and current journal sequence number in ++ * the superblock: ++ */ ++struct jset_entry { ++ __le16 u64s; ++ __u8 btree_id; ++ __u8 level; ++ __u8 type; /* designates what this jset holds */ ++ __u8 pad[3]; ++ ++ union { ++ struct bkey_i start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct bch_sb_field_clean { ++ struct bch_sb_field field; ++ ++ __le32 flags; ++ __le16 _read_clock; /* no longer used */ ++ __le16 _write_clock; ++ __le64 journal_seq; ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct journal_seq_blacklist_entry { ++ __le64 start; ++ __le64 end; ++}; ++ ++struct bch_sb_field_journal_seq_blacklist { ++ struct bch_sb_field field; ++ ++ union { ++ struct journal_seq_blacklist_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++/* Superblock: */ ++ ++/* ++ * New versioning scheme: ++ * One common version number for all on disk data structures - superblock, btree ++ * nodes, journal entries ++ */ ++#define BCH_JSET_VERSION_OLD 2 ++#define BCH_BSET_VERSION_OLD 3 ++ ++#define BCH_METADATA_VERSIONS() \ ++ x(bkey_renumber, 10) \ ++ x(inode_btree_change, 11) \ ++ x(snapshot, 12) \ ++ x(inode_backpointers, 13) \ ++ x(btree_ptr_sectors_written, 14) \ ++ x(snapshot_2, 15) \ ++ x(reflink_p_fix, 16) \ ++ x(subvol_dirent, 17) \ ++ x(inode_v2, 18) \ ++ x(freespace, 19) \ ++ x(alloc_v4, 20) \ ++ x(new_data_types, 21) \ ++ x(backpointers, 22) ++ ++enum bcachefs_metadata_version { ++ bcachefs_metadata_version_min = 9, ++#define x(t, n) bcachefs_metadata_version_##t = n, ++ BCH_METADATA_VERSIONS() ++#undef x ++ bcachefs_metadata_version_max ++}; ++ ++#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) ++ ++#define BCH_SB_SECTOR 8 ++#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ ++ ++struct bch_sb_layout { ++ uuid_le magic; /* bcachefs superblock UUID */ ++ __u8 layout_type; ++ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ ++ __u8 nr_superblocks; ++ __u8 pad[5]; ++ __le64 sb_offset[61]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_SB_LAYOUT_SECTOR 7 ++ ++/* ++ * @offset - sector where this sb was written ++ * @version - on disk format version ++ * @version_min - Oldest metadata version this filesystem contains; so we can ++ * safely drop compatibility code and refuse to mount filesystems ++ * we'd need it for ++ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) ++ * @seq - incremented each time superblock is written ++ * @uuid - used for generating various magic numbers and identifying ++ * member devices, never changes ++ * @user_uuid - user visible UUID, may be changed ++ * @label - filesystem label ++ * @seq - identifies most recent superblock, incremented each time ++ * superblock is written ++ * @features - enabled incompatible features ++ */ ++struct bch_sb { ++ struct bch_csum csum; ++ __le16 version; ++ __le16 version_min; ++ __le16 pad[2]; ++ uuid_le magic; ++ uuid_le uuid; ++ uuid_le user_uuid; ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 offset; ++ __le64 seq; ++ ++ __le16 block_size; ++ __u8 dev_idx; ++ __u8 nr_devices; ++ __le32 u64s; ++ ++ __le64 time_base_lo; ++ __le32 time_base_hi; ++ __le32 time_precision; ++ ++ __le64 flags[8]; ++ __le64 features[2]; ++ __le64 compat[2]; ++ ++ struct bch_sb_layout layout; ++ ++ union { ++ struct bch_sb_field start[0]; ++ __le64 _data[0]; ++ }; 
++} __attribute__((packed, aligned(8))); ++ ++/* ++ * Flags: ++ * BCH_SB_INITALIZED - set on first mount ++ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect ++ * behaviour of mount/recovery path: ++ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits ++ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 ++ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides ++ * DATA/META_CSUM_TYPE. Also indicates encryption ++ * algorithm in use, if/when we get more than one ++ */ ++ ++LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); ++ ++LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); ++LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); ++LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); ++LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); ++ ++LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); ++ ++LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); ++LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); ++ ++LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); ++LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); ++ ++LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); ++LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); ++LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); ++LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); ++ ++LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); ++ ++LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); ++ ++LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); ++ ++LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); ++LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); ++ ++/* ++ * Max size of an extent that may require bouncing to read or write ++ * (checksummed, compressed): 64k ++ */ ++LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, ++ struct bch_sb, flags[1], 14, 20); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); ++ ++LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); ++LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); ++LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); ++ ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++ struct bch_sb, flags[2], 0, 4); ++LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); ++ ++LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); ++LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); ++LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); ++LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); ++/* Obsolete, always enabled: */ 
++LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); ++ ++/* ++ * Features: ++ * ++ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist ++ * reflink: gates KEY_TYPE_reflink ++ * inline_data: gates KEY_TYPE_inline_data ++ * new_siphash: gates BCH_STR_HASH_siphash ++ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE ++ */ ++#define BCH_SB_FEATURES() \ ++ x(lz4, 0) \ ++ x(gzip, 1) \ ++ x(zstd, 2) \ ++ x(atomic_nlink, 3) \ ++ x(ec, 4) \ ++ x(journal_seq_blacklist_v3, 5) \ ++ x(reflink, 6) \ ++ x(new_siphash, 7) \ ++ x(inline_data, 8) \ ++ x(new_extent_overwrite, 9) \ ++ x(incompressible, 10) \ ++ x(btree_ptr_v2, 11) \ ++ x(extents_above_btree_updates, 12) \ ++ x(btree_updates_journalled, 13) \ ++ x(reflink_inline_data, 14) \ ++ x(new_varint, 15) \ ++ x(journal_no_flush, 16) \ ++ x(alloc_v2, 17) \ ++ x(extents_across_btree_nodes, 18) ++ ++#define BCH_SB_FEATURES_ALWAYS \ ++ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ ++ (1ULL << BCH_FEATURE_btree_updates_journalled)|\ ++ (1ULL << BCH_FEATURE_alloc_v2)|\ ++ (1ULL << BCH_FEATURE_extents_across_btree_nodes)) ++ ++#define BCH_SB_FEATURES_ALL \ ++ (BCH_SB_FEATURES_ALWAYS| \ ++ (1ULL << BCH_FEATURE_new_siphash)| \ ++ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ ++ (1ULL << BCH_FEATURE_new_varint)| \ ++ (1ULL << BCH_FEATURE_journal_no_flush)) ++ ++enum bch_sb_feature { ++#define x(f, n) BCH_FEATURE_##f, ++ BCH_SB_FEATURES() ++#undef x ++ BCH_FEATURE_NR, ++}; ++ ++#define BCH_SB_COMPAT() \ ++ x(alloc_info, 0) \ ++ x(alloc_metadata, 1) \ ++ x(extents_above_btree_updates_done, 2) \ ++ x(bformat_overflow_done, 3) ++ ++enum bch_sb_compat { ++#define x(f, n) BCH_COMPAT_##f, ++ BCH_SB_COMPAT() ++#undef x ++ BCH_COMPAT_NR, ++}; ++ ++/* options: */ ++ ++#define BCH_REPLICAS_MAX 4U ++ ++#define BCH_BKEY_PTRS_MAX 16U ++ ++#define BCH_ERROR_ACTIONS() \ ++ x(continue, 0) \ ++ x(ro, 1) \ ++ x(panic, 2) ++ ++enum bch_error_actions { ++#define x(t, n) BCH_ON_ERROR_##t = n, ++ BCH_ERROR_ACTIONS() ++#undef x ++ BCH_ON_ERROR_NR ++}; ++ ++#define BCH_STR_HASH_TYPES() \ ++ x(crc32c, 0) \ ++ x(crc64, 1) \ ++ x(siphash_old, 2) \ ++ x(siphash, 3) ++ ++enum bch_str_hash_type { ++#define x(t, n) BCH_STR_HASH_##t = n, ++ BCH_STR_HASH_TYPES() ++#undef x ++ BCH_STR_HASH_NR ++}; ++ ++#define BCH_STR_HASH_OPTS() \ ++ x(crc32c, 0) \ ++ x(crc64, 1) \ ++ x(siphash, 2) ++ ++enum bch_str_hash_opts { ++#define x(t, n) BCH_STR_HASH_OPT_##t = n, ++ BCH_STR_HASH_OPTS() ++#undef x ++ BCH_STR_HASH_OPT_NR ++}; ++ ++#define BCH_CSUM_TYPES() \ ++ x(none, 0) \ ++ x(crc32c_nonzero, 1) \ ++ x(crc64_nonzero, 2) \ ++ x(chacha20_poly1305_80, 3) \ ++ x(chacha20_poly1305_128, 4) \ ++ x(crc32c, 5) \ ++ x(crc64, 6) \ ++ x(xxhash, 7) ++ ++enum bch_csum_type { ++#define x(t, n) BCH_CSUM_##t = n, ++ BCH_CSUM_TYPES() ++#undef x ++ BCH_CSUM_NR ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_none] = 0, ++ [BCH_CSUM_crc32c_nonzero] = 4, ++ [BCH_CSUM_crc32c] = 4, ++ [BCH_CSUM_crc64_nonzero] = 8, ++ [BCH_CSUM_crc64] = 8, ++ [BCH_CSUM_xxhash] = 8, ++ [BCH_CSUM_chacha20_poly1305_80] = 10, ++ [BCH_CSUM_chacha20_poly1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++#define BCH_CSUM_OPTS() \ ++ x(none, 0) \ ++ x(crc32c, 1) \ ++ x(crc64, 2) \ ++ x(xxhash, 3) ++ ++enum bch_csum_opts { 
++#define x(t, n) BCH_CSUM_OPT_##t = n, ++ BCH_CSUM_OPTS() ++#undef x ++ BCH_CSUM_OPT_NR ++}; ++ ++#define BCH_COMPRESSION_TYPES() \ ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) \ ++ x(incompressible, 5) ++ ++enum bch_compression_type { ++#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, ++ BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_TYPE_NR ++}; ++ ++#define BCH_COMPRESSION_OPTS() \ ++ x(none, 0) \ ++ x(lz4, 1) \ ++ x(gzip, 2) \ ++ x(zstd, 3) ++ ++enum bch_compression_opts { ++#define x(t, n) BCH_COMPRESSION_OPT_##t = n, ++ BCH_COMPRESSION_OPTS() ++#undef x ++ BCH_COMPRESSION_OPT_NR ++}; ++ ++/* ++ * Magic numbers ++ * ++ * The various other data structures have their own magic numbers, which are ++ * xored with the first part of the cache set's UUID ++ */ ++ ++#define BCACHE_MAGIC \ ++ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ ++ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) ++ ++#define BCACHEFS_STATFS_MAGIC 0xca451a4e ++ ++#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) ++#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) ++ ++static inline __le64 __bch2_sb_magic(struct bch_sb *sb) ++{ ++ __le64 ret; ++ memcpy(&ret, &sb->uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 __jset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); ++} ++ ++static inline __u64 __bset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); ++} ++ ++/* Journal */ ++ ++#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) ++ ++#define BCH_JSET_ENTRY_TYPES() \ ++ x(btree_keys, 0) \ ++ x(btree_root, 1) \ ++ x(prio_ptrs, 2) \ ++ x(blacklist, 3) \ ++ x(blacklist_v2, 4) \ ++ x(usage, 5) \ ++ x(data_usage, 6) \ ++ x(clock, 7) \ ++ x(dev_usage, 8) \ ++ x(log, 9) \ ++ x(overwrite, 10) ++ ++enum { ++#define x(f, nr) BCH_JSET_ENTRY_##f = nr, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++ BCH_JSET_ENTRY_NR ++}; ++ ++/* ++ * Journal sequence numbers can be blacklisted: bsets record the max sequence ++ * number of all the journal entries they contain updates for, so that on ++ * recovery we can ignore those bsets that contain index updates newer that what ++ * made it into the journal. ++ * ++ * This means that we can't reuse that journal_seq - we have to skip it, and ++ * then record that we skipped it so that the next time we crash and recover we ++ * don't think there was a missing journal entry. 
++ */ ++struct jset_entry_blacklist { ++ struct jset_entry entry; ++ __le64 seq; ++}; ++ ++struct jset_entry_blacklist_v2 { ++ struct jset_entry entry; ++ __le64 start; ++ __le64 end; ++}; ++ ++#define BCH_FS_USAGE_TYPES() \ ++ x(reserved, 0) \ ++ x(inodes, 1) \ ++ x(key_version, 2) ++ ++enum { ++#define x(f, nr) BCH_FS_USAGE_##f = nr, ++ BCH_FS_USAGE_TYPES() ++#undef x ++ BCH_FS_USAGE_NR ++}; ++ ++struct jset_entry_usage { ++ struct jset_entry entry; ++ __le64 v; ++} __attribute__((packed)); ++ ++struct jset_entry_data_usage { ++ struct jset_entry entry; ++ __le64 v; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++struct jset_entry_clock { ++ struct jset_entry entry; ++ __u8 rw; ++ __u8 pad[7]; ++ __le64 time; ++} __attribute__((packed)); ++ ++struct jset_entry_dev_usage_type { ++ __le64 buckets; ++ __le64 sectors; ++ __le64 fragmented; ++} __attribute__((packed)); ++ ++struct jset_entry_dev_usage { ++ struct jset_entry entry; ++ __le32 dev; ++ __u32 pad; ++ ++ __le64 buckets_ec; ++ __le64 _buckets_unavailable; /* No longer used */ ++ ++ struct jset_entry_dev_usage_type d[]; ++} __attribute__((packed)); ++ ++static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) ++{ ++ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / ++ sizeof(struct jset_entry_dev_usage_type); ++} ++ ++struct jset_entry_log { ++ struct jset_entry entry; ++ u8 d[]; ++} __attribute__((packed)); ++ ++/* ++ * On disk format for a journal entry: ++ * seq is monotonically increasing; every journal entry has its own unique ++ * sequence number. ++ * ++ * last_seq is the oldest journal entry that still has keys the btree hasn't ++ * flushed to disk yet. ++ * ++ * version is for on disk format changes. ++ */ ++struct jset { ++ struct bch_csum csum; ++ ++ __le64 magic; ++ __le64 seq; ++ __le32 version; ++ __le32 flags; ++ ++ __le32 u64s; /* size of d[] in u64s */ ++ ++ __u8 encrypted_start[0]; ++ ++ __le16 _read_clock; /* no longer used */ ++ __le16 _write_clock; ++ ++ /* Sequence number of oldest dirty journal entry */ ++ __le64 last_seq; ++ ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); ++LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); ++ ++#define BCH_JOURNAL_BUCKETS_MIN 8 ++ ++/* Btree: */ ++ ++#define BCH_BTREE_IDS() \ ++ x(extents, 0) \ ++ x(inodes, 1) \ ++ x(dirents, 2) \ ++ x(xattrs, 3) \ ++ x(alloc, 4) \ ++ x(quotas, 5) \ ++ x(stripes, 6) \ ++ x(reflink, 7) \ ++ x(subvolumes, 8) \ ++ x(snapshots, 9) \ ++ x(lru, 10) \ ++ x(freespace, 11) \ ++ x(need_discard, 12) \ ++ x(backpointers, 13) ++ ++enum btree_id { ++#define x(kwd, val) BTREE_ID_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BTREE_ID_NR ++}; ++ ++#define BTREE_MAX_DEPTH 4U ++ ++/* Btree nodes */ ++ ++/* ++ * Btree nodes ++ * ++ * On disk a btree node is a list/log of these; within each set the keys are ++ * sorted ++ */ ++struct bset { ++ __le64 seq; ++ ++ /* ++ * Highest journal entry this bset contains keys for. ++ * If on recovery we don't see that journal entry, this bset is ignored: ++ * this allows us to preserve the order of all index updates after a ++ * crash, since the journal records a total order of all index updates ++ * and anything that didn't make it to the journal doesn't get used. 
++ */ ++ __le64 journal_seq; ++ ++ __le32 flags; ++ __le16 version; ++ __le16 u64s; /* count of d[] in u64s */ ++ ++ union { ++ struct bkey_packed start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); ++ ++LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); ++LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, ++ struct bset, flags, 5, 6); ++ ++/* Sector offset within the btree node: */ ++LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); ++ ++struct btree_node { ++ struct bch_csum csum; ++ __le64 magic; ++ ++ /* this flags field is encrypted, unlike bset->flags: */ ++ __le64 flags; ++ ++ /* Closed interval: */ ++ struct bpos min_key; ++ struct bpos max_key; ++ struct bch_extent_ptr _ptr; /* not used anymore */ ++ struct bkey_format format; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); ++LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); ++LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, ++ struct btree_node, flags, 8, 9); ++/* 9-32 unused */ ++LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); ++ ++struct btree_node_entry { ++ struct bch_csum csum; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++#endif /* _BCACHEFS_FORMAT_H */ +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +new file mode 100644 +index 000000000000..b2edabf58260 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -0,0 +1,368 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IOCTL_H ++#define _BCACHEFS_IOCTL_H ++ ++#include ++#include ++#include "bcachefs_format.h" ++ ++/* ++ * Flags common to multiple ioctls: ++ */ ++#define BCH_FORCE_IF_DATA_LOST (1 << 0) ++#define BCH_FORCE_IF_METADATA_LOST (1 << 1) ++#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) ++#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) ++ ++#define BCH_FORCE_IF_LOST \ ++ (BCH_FORCE_IF_DATA_LOST| \ ++ BCH_FORCE_IF_METADATA_LOST) ++#define BCH_FORCE_IF_DEGRADED \ ++ (BCH_FORCE_IF_DATA_DEGRADED| \ ++ BCH_FORCE_IF_METADATA_DEGRADED) ++ ++/* ++ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname ++ * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the ++ * filesystem: ++ */ ++#define BCH_BY_INDEX (1 << 4) ++ ++/* ++ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem ++ * wide superblock: ++ */ ++#define BCH_READ_DEV (1 << 5) ++ ++/* global control dev: */ ++ ++/* These are currently broken, and probably unnecessary: */ ++#if 0 ++#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) ++#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) ++ ++struct bch_ioctl_assemble { ++ __u32 flags; ++ __u32 nr_devs; ++ __u64 pad; ++ __u64 devs[]; ++}; ++ ++struct bch_ioctl_incremental { ++ __u32 flags; ++ __u64 pad; ++ __u64 dev; ++}; ++#endif ++ ++/* filesystem ioctls: */ ++ ++#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) ++ ++/* These only make sense when we also have incremental assembly */ ++#if 0 ++#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) ++#define BCH_IOCTL_STOP _IO(0xbc, 3) ++#endif ++ ++#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) ++#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) ++#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) ++#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) ++#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) ++#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) ++#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) ++ ++#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) ++#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) ++ ++/* ioctl below act on a particular file, not the filesystem as a whole: */ ++ ++#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) ++ ++/* ++ * BCH_IOCTL_QUERY_UUID: get filesystem UUID ++ * ++ * Returns user visible UUID, not internal UUID (which may not ever be changed); ++ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with ++ * this UUID. ++ */ ++struct bch_ioctl_query_uuid { ++ uuid_le uuid; ++}; ++ ++#if 0 ++struct bch_ioctl_start { ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ ++/* ++ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem ++ * ++ * The specified device must not be open or in use. On success, the new device ++ * will be an online member of the filesystem just like any other member. ++ * ++ * The device must first be prepared by userspace by formatting with a bcachefs ++ * superblock, which is only used for passing in superblock options/parameters ++ * for that device (in struct bch_member). The new device's superblock should ++ * not claim to be a member of any existing filesystem - UUIDs on it will be ++ * ignored. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem ++ * ++ * Any data present on @dev will be permanently deleted, and @dev will be ++ * removed from its slot in the filesystem's list of member devices. The device ++ * may be either offline or offline. 
++ * ++ * Will fail removing @dev would leave us with insufficient read write devices ++ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are ++ * set. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem ++ * but is not open (e.g. because we started in degraded mode), bring it online ++ * ++ * all existing data on @dev will be available once the device is online, ++ * exactly as if @dev was present when the filesystem was first mounted ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that ++ * block device, without removing it from the filesystem (so it can be brought ++ * back online later) ++ * ++ * Data present on @dev will be unavailable while @dev is offline (unless ++ * replicated), but will still be intact and untouched if @dev is brought back ++ * online ++ * ++ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would ++ * leave us with insufficient read write devices or degraded/unavailable data, ++ * unless the approprate BCH_FORCE_IF_* flags are set. ++ */ ++ ++struct bch_ioctl_disk { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem ++ * ++ * @new_state - one of the bch_member_state states (rw, ro, failed, ++ * spare) ++ * ++ * Will refuse to change member state if we would then have insufficient devices ++ * to write to, or if it would result in degraded data (when @new_state is ++ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++struct bch_ioctl_disk_set_state { ++ __u32 flags; ++ __u8 new_state; ++ __u8 pad[3]; ++ __u64 dev; ++}; ++ ++enum bch_data_ops { ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_REWRITE_OLD_NODES = 3, ++ BCH_DATA_OP_NR = 4, ++}; ++ ++/* ++ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. ++ * scrub, rereplicate, migrate). ++ * ++ * This ioctl kicks off a job in the background, and returns a file descriptor. ++ * Reading from the file descriptor returns a struct bch_ioctl_data_event, ++ * indicating current progress, and closing the file descriptor will stop the ++ * job. The file descriptor is O_CLOEXEC. 
++ */ ++struct bch_ioctl_data { ++ __u16 op; ++ __u8 start_btree; ++ __u8 end_btree; ++ __u32 flags; ++ ++ struct bpos start_pos; ++ struct bpos end_pos; ++ ++ union { ++ struct { ++ __u32 dev; ++ __u32 pad; ++ } migrate; ++ struct { ++ __u64 pad[8]; ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_data_event { ++ BCH_DATA_EVENT_PROGRESS = 0, ++ /* XXX: add an event for reporting errors */ ++ BCH_DATA_EVENT_NR = 1, ++}; ++ ++struct bch_ioctl_data_progress { ++ __u8 data_type; ++ __u8 btree_id; ++ __u8 pad[2]; ++ struct bpos pos; ++ ++ __u64 sectors_done; ++ __u64 sectors_total; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_data_event { ++ __u8 type; ++ __u8 pad[7]; ++ union { ++ struct bch_ioctl_data_progress p; ++ __u64 pad2[15]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_usage { ++ __u64 sectors; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++static inline struct bch_replicas_usage * ++replicas_usage_next(struct bch_replicas_usage *u) ++{ ++ return (void *) u + replicas_entry_bytes(&u->r) + 8; ++} ++ ++/* ++ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries ++ * ++ * On success, @replica_entries_bytes will be changed to indicate the number of ++ * bytes actually used. ++ * ++ * Returns -ERANGE if @replica_entries_bytes was too small ++ */ ++struct bch_ioctl_fs_usage { ++ __u64 capacity; ++ __u64 used; ++ __u64 online_reserved; ++ __u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ ++ __u32 replica_entries_bytes; ++ __u32 pad; ++ ++ struct bch_replicas_usage replicas[0]; ++}; ++ ++/* ++ * BCH_IOCTL_DEV_USAGE: query device disk space usage ++ * ++ * Returns disk space usage broken out by data type - both by buckets and ++ * sectors. 
++ */ ++struct bch_ioctl_dev_usage { ++ __u64 dev; ++ __u32 flags; ++ __u8 state; ++ __u8 pad[7]; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; ++ ++ __u64 buckets_ec; ++ ++ struct bch_ioctl_dev_usage_type { ++ __u64 buckets; ++ __u64 sectors; ++ __u64 fragmented; ++ } d[BCH_DATA_NR]; ++}; ++ ++/* ++ * BCH_IOCTL_READ_SUPER: read filesystem superblock ++ * ++ * Equivalent to reading the superblock directly from the block device, except ++ * avoids racing with the kernel writing the superblock or having to figure out ++ * which block device to read ++ * ++ * @sb - buffer to read into ++ * @size - size of userspace allocated buffer ++ * @dev - device to read superblock for, if BCH_READ_DEV flag is ++ * specified ++ * ++ * Returns -ERANGE if buffer provided is too small ++ */ ++struct bch_ioctl_read_super { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 size; ++ __u64 sb; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to ++ * determine if disk is a (online) member - if so, returns device's index ++ * ++ * Returns -ENOENT if not found ++ */ ++struct bch_ioctl_disk_get_idx { ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize_journal { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++struct bch_ioctl_subvolume { ++ __u32 flags; ++ __u32 dirfd; ++ __u16 mode; ++ __u16 pad[3]; ++ __u64 dst_ptr; ++ __u64 src_ptr; ++}; ++ ++#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) ++#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) ++ ++#endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +new file mode 100644 +index 000000000000..cc0689635164 +--- /dev/null ++++ b/fs/bcachefs/bkey.c +@@ -0,0 +1,1175 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "bset.h" ++#include "util.h" ++ ++#undef EBUG_ON ++ ++#ifdef DEBUG_BKEYS ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) ++#endif ++ ++const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) ++{ ++ unsigned bit = high_bit_offset, done = 0; ++ ++ while (1) { ++ while (bit < 64) { ++ if (done && !(done % 8)) ++ *out++ = ' '; ++ *out++ = *p & (1ULL << (63 - bit)) ? 
'1' : '0'; ++ bit++; ++ done++; ++ if (done == nr_bits) { ++ *out++ = '\0'; ++ return; ++ } ++ } ++ ++ p = next_word(p); ++ bit = 0; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ struct bkey tmp; ++ ++ BUG_ON(bkeyp_val_u64s(format, packed) != ++ bkey_val_u64s(unpacked)); ++ ++ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); ++ ++ tmp = __bch2_bkey_unpack_key(format, packed); ++ ++ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ char buf3[160], buf4[160]; ++ ++ bch2_bkey_to_text(&buf1, unpacked); ++ bch2_bkey_to_text(&buf2, &tmp); ++ bch2_to_binary(buf3, (void *) unpacked, 80); ++ bch2_to_binary(buf4, high_word(format, packed), 80); ++ ++ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", ++ format->key_u64s, ++ format->bits_per_field[0], ++ format->bits_per_field[1], ++ format->bits_per_field[2], ++ format->bits_per_field[3], ++ format->bits_per_field[4], ++ buf1.buf, buf2.buf, buf3, buf4); ++ } ++} ++ ++#else ++static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) {} ++#endif ++ ++struct pack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct pack_state pack_state_init(const struct bkey_format *format, ++ struct bkey_packed *k) ++{ ++ u64 *p = high_word(format, k); ++ ++ return (struct pack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = 0, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static void pack_state_finish(struct pack_state *state, ++ struct bkey_packed *k) ++{ ++ EBUG_ON(state->p < k->_data); ++ EBUG_ON(state->p >= k->_data + state->format->key_u64s); ++ ++ *state->p = state->w; ++} ++ ++struct unpack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ const u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct unpack_state unpack_state_init(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(format, k); ++ ++ return (struct unpack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = *p << high_bit_offset, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static u64 get_inc_field(struct unpack_state *state, unsigned field) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (bits >= state->bits) { ++ v = state->w >> (64 - bits); ++ bits -= state->bits; ++ ++ state->p = next_word(state->p); ++ state->w = *state->p; ++ state->bits = 64; ++ } ++ ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ v |= (state->w >> 1) >> (63 - bits); ++ state->w <<= bits; ++ state->bits -= bits; ++ ++ return v + offset; ++} ++ ++__always_inline ++static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (v < offset) ++ return false; ++ ++ v -= offset; ++ ++ if (fls64(v) > bits) ++ return false; ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 
if bits is 0 - bits is never 64 here: */ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return true; ++} ++ ++/* ++ * Note: does NOT set out->format (we don't know what it should be here!) ++ * ++ * Also: doesn't work on extents - it doesn't preserve the invariant that ++ * if k is packed bkey_start_pos(k) will successfully pack ++ */ ++static bool bch2_bkey_transform_key(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ struct pack_state out_s = pack_state_init(out_f, out); ++ struct unpack_state in_s = unpack_state_init(in_f, in); ++ u64 *w = out->_data; ++ unsigned i; ++ ++ *w = 0; ++ ++ for (i = 0; i < BKEY_NR_FIELDS; i++) ++ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) ++ return false; ++ ++ /* Can't happen because the val would be too big to unpack: */ ++ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); ++ ++ pack_state_finish(&out_s, out); ++ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ return true; ++} ++ ++bool bch2_bkey_transform(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) ++ return false; ++ ++ memcpy_u64s((u64 *) out + out_f->key_u64s, ++ (u64 *) in + in_f->key_u64s, ++ (in->u64s - in_f->key_u64s)); ++ return true; ++} ++ ++#define bkey_fields() \ ++ x(BKEY_FIELD_INODE, p.inode) \ ++ x(BKEY_FIELD_OFFSET, p.offset) \ ++ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ ++ x(BKEY_FIELD_SIZE, size) \ ++ x(BKEY_FIELD_VERSION_HI, version.hi) \ ++ x(BKEY_FIELD_VERSION_LO, version.lo) ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bkey out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); ++ ++ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; ++ out.format = KEY_FORMAT_CURRENT; ++ out.needs_whiteout = in->needs_whiteout; ++ out.type = in->type; ++ out.pad[0] = 0; ++ ++#define x(id, field) out.field = get_inc_field(&state, id); ++ bkey_fields() ++#undef x ++ ++ return out; ++} ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bpos out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ ++ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); ++ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); ++ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); ++ ++ return out; ++} ++#endif ++ ++/** ++ * bch2_bkey_pack_key -- pack just the key, not the value ++ */ ++bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, ++ const struct bkey_format *format) ++{ ++ struct pack_state state = pack_state_init(format, out); ++ u64 *w = out->_data; ++ ++ EBUG_ON((void *) in == (void *) out); ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ 
EBUG_ON(in->format != KEY_FORMAT_CURRENT); ++ ++ *w = 0; ++ ++#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; ++ bkey_fields() ++#undef x ++ ++ /* ++ * Extents - we have to guarantee that if an extent is packed, a trimmed ++ * version will also pack: ++ */ ++ if (bkey_start_offset(in) < ++ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) ++ return false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ bch2_bkey_pack_verify(out, in, format); ++ return true; ++} ++ ++/** ++ * bch2_bkey_unpack -- unpack the key and the value ++ */ ++void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, ++ const struct bkey_packed *src) ++{ ++ __bkey_unpack_key(b, &dst->k, src); ++ ++ memcpy_u64s(&dst->v, ++ bkeyp_val(&b->format, src), ++ bkeyp_val_u64s(&b->format, src)); ++} ++ ++/** ++ * bch2_bkey_pack -- pack the key and the value ++ */ ++bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, ++ const struct bkey_format *format) ++{ ++ struct bkey_packed tmp; ++ ++ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) ++ return false; ++ ++ memmove_u64s((u64 *) out + format->key_u64s, ++ &in->v, ++ bkey_val_u64s(&in->k)); ++ memcpy_u64s(out, &tmp, format->key_u64s); ++ ++ return true; ++} ++ ++__always_inline ++static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ bool ret = true; ++ ++ EBUG_ON(v < offset); ++ v -= offset; ++ ++ if (fls64(v) > bits) { ++ v = ~(~0ULL << bits); ++ ret = false; ++ } ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static bool bkey_packed_successor(struct bkey_packed *out, ++ const struct btree *b, ++ struct bkey_packed k) ++{ ++ const struct bkey_format *f = &b->format; ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned first_bit, offset; ++ u64 *p; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ if (!nr_key_bits) ++ return false; ++ ++ *out = k; ++ ++ first_bit = high_bit_offset + nr_key_bits - 1; ++ p = nth_word(high_word(f, out), first_bit >> 6); ++ offset = 63 - (first_bit & 63); ++ ++ while (nr_key_bits) { ++ unsigned bits = min(64 - offset, nr_key_bits); ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if ((*p & mask) != mask) { ++ *p += 1ULL << offset; ++ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); ++ return true; ++ } ++ ++ *p &= ~mask; ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ offset = 0; ++ } ++ ++ return false; ++} ++#endif ++ ++/* ++ * Returns a packed key that compares <= in ++ * ++ * This is used in bset_search_tree(), where we need a packed pos in order to be ++ * able to compare against the keys in the auxiliary search tree - and it's ++ * legal to use a packed pos that isn't equivalent to the original pos, ++ * _provided_ it compares <= to the original pos. 
++ */ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, ++ struct bpos in, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct pack_state state = pack_state_init(f, out); ++ u64 *w = out->_data; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos orig = in; ++#endif ++ bool exact = true; ++ unsigned i; ++ ++ /* ++ * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 ++ * byte header, but pack_pos() won't if the len/version fields are big ++ * enough - we need to make sure to zero them out: ++ */ ++ for (i = 0; i < f->key_u64s; i++) ++ w[i] = 0; ++ ++ if (unlikely(in.snapshot < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { ++ if (!in.offset-- && ++ !in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.offset < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { ++ if (!in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.inode < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) ++ return BKEY_PACK_POS_FAIL; ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) ++ exact = false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = f->key_u64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->type = KEY_TYPE_deleted; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; ++ ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ } ++#endif ++ ++ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; ++} ++ ++void bch2_bkey_format_init(struct bkey_format_state *s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) ++ s->field_min[i] = U64_MAX; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) ++ s->field_max[i] = 0; ++ ++ /* Make sure we can store a size of 0: */ ++ s->field_min[BKEY_FIELD_SIZE] = 0; ++} ++ ++static void __bkey_format_add(struct bkey_format_state *s, ++ unsigned field, u64 v) ++{ ++ s->field_min[field] = min(s->field_min[field], v); ++ s->field_max[field] = max(s->field_max[field], v); ++} ++ ++/* ++ * Changes @format so that @k can be successfully packed with @format ++ */ ++void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) ++{ ++#define x(id, field) __bkey_format_add(s, id, k->field); ++ bkey_fields() ++#undef x ++ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); ++} ++ ++void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) ++{ ++ unsigned field = 0; ++ ++ __bkey_format_add(s, field++, p.inode); ++ __bkey_format_add(s, field++, p.offset); ++ __bkey_format_add(s, field++, p.snapshot); ++} ++ ++/* ++ * We don't want it to be possible for the packed format to represent fields ++ * bigger than a u64... 
that will cause confusion and issues (like with ++ * bkey_packed_successor()) ++ */ ++static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, ++ unsigned bits, u64 offset) ++{ ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ ++ bits = min(bits, unpacked_bits); ++ ++ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); ++ ++ f->bits_per_field[i] = bits; ++ f->field_offset[i] = cpu_to_le64(offset); ++} ++ ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ struct bkey_format ret = { ++ .nr_fields = BKEY_NR_FIELDS, ++ }; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { ++ s->field_min[i] = min(s->field_min[i], s->field_max[i]); ++ ++ set_format_field(&ret, i, ++ fls64(s->field_max[i] - s->field_min[i]), ++ s->field_min[i]); ++ ++ bits += ret.bits_per_field[i]; ++ } ++ ++ /* allow for extent merging: */ ++ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { ++ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; ++ bits += 4; ++ } ++ ++ ret.key_u64s = DIV_ROUND_UP(bits, 64); ++ ++ /* if we have enough spare bits, round fields up to nearest byte */ ++ bits = ret.key_u64s * 64 - bits; ++ ++ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { ++ unsigned r = round_up(ret.bits_per_field[i], 8) - ++ ret.bits_per_field[i]; ++ ++ if (r <= bits) { ++ set_format_field(&ret, i, ++ ret.bits_per_field[i] + r, ++ le64_to_cpu(ret.field_offset[i])); ++ bits -= r; ++ } ++ } ++ ++ EBUG_ON(bch2_bkey_format_validate(&ret)); ++ return ret; ++} ++ ++const char *bch2_bkey_format_validate(struct bkey_format *f) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ ++ if (f->nr_fields != BKEY_NR_FIELDS) ++ return "incorrect number of fields"; ++ ++ /* ++ * Verify that the packed format can't represent fields larger than the ++ * unpacked format: ++ */ ++ for (i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) ++ return "field too large"; ++ ++ bits += f->bits_per_field[i]; ++ } ++ ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) ++ return "incorrect key_u64s"; ++ ++ return NULL; ++} ++ ++/* ++ * Most significant differing bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, ++ const struct bkey_packed *l_k, ++ const struct bkey_packed *r_k) ++{ ++ const u64 *l = high_word(&b->format, l_k); ++ const u64 *r = high_word(&b->format, r_k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned word_bits = 64 - high_bit_offset; ++ u64 l_v, r_v; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ /* for big endian, skip past header */ ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (nr_key_bits) { ++ if (nr_key_bits < word_bits) { ++ l_v >>= word_bits - nr_key_bits; ++ r_v >>= word_bits - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= word_bits; ++ } ++ ++ if (l_v != r_v) ++ return fls64(l_v ^ r_v) - 1 + nr_key_bits; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ word_bits = 64; ++ } ++ ++ return 0; ++} ++ ++/* ++ * First set bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(&b->format, k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned ret = 0, offset; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ offset = nr_key_bits; ++ while (offset > 64) { ++ p = next_word(p); ++ offset -= 64; ++ } ++ ++ offset = 64 - offset; ++ ++ while (nr_key_bits) { ++ unsigned bits = nr_key_bits + offset < 64 ++ ? 
nr_key_bits ++ : 64 - offset; ++ ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if (*p & mask) ++ return ret + __ffs64(*p & mask) - offset; ++ ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ ret += bits; ++ offset = 0; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_64 ++ ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++ ++#define I(_x) (*(out)++ = (_x)) ++#define I1(i0) I(i0) ++#define I2(i0, i1) (I1(i0), I(i1)) ++#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) ++#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) ++#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) ++ ++static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, ++ enum bch_bkey_fields field, ++ unsigned dst_offset, unsigned dst_size, ++ bool *eax_zeroed) ++{ ++ unsigned bits = format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(format->field_offset[field]); ++ unsigned i, byte, bit_offset, align, shl, shr; ++ ++ if (!bits && !offset) { ++ if (!*eax_zeroed) { ++ /* xor eax, eax */ ++ I2(0x31, 0xc0); ++ } ++ ++ *eax_zeroed = true; ++ goto set_field; ++ } ++ ++ if (!bits) { ++ /* just return offset: */ ++ ++ switch (dst_size) { ++ case 8: ++ if (offset > S32_MAX) { ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ ++ I3(0xc7, 0x47, dst_offset + 4); ++ memcpy(out, (void *) &offset + 4, 4); ++ out += 4; ++ } else { ++ /* mov [rdi + dst_offset], offset */ ++ /* sign extended */ ++ I4(0x48, 0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++ } ++ ++ bit_offset = format->key_u64s * 64; ++ for (i = 0; i <= field; i++) ++ bit_offset -= format->bits_per_field[i]; ++ ++ byte = bit_offset / 8; ++ bit_offset -= byte * 8; ++ ++ *eax_zeroed = false; ++ ++ if (bit_offset == 0 && bits == 8) { ++ /* movzx eax, BYTE PTR [rsi + imm8] */ ++ I4(0x0f, 0xb6, 0x46, byte); ++ } else if (bit_offset == 0 && bits == 16) { ++ /* movzx eax, WORD PTR [rsi + imm8] */ ++ I4(0x0f, 0xb7, 0x46, byte); ++ } else if (bit_offset + bits <= 32) { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 32); ++ ++ /* mov eax, [rsi + imm8] */ ++ I3(0x8b, 0x46, byte); ++ ++ if (bit_offset) { ++ /* shr eax, imm8 */ ++ I3(0xc1, 0xe8, bit_offset); ++ } ++ ++ if (bit_offset + bits < 32) { ++ unsigned mask = ~0U >> (32 - bits); ++ ++ /* and eax, imm32 */ ++ I1(0x25); ++ memcpy(out, &mask, 4); ++ out += 4; ++ } ++ } else if (bit_offset + bits <= 64) { ++ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); ++ byte -= align; ++ 
bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 64); ++ ++ /* mov rax, [rsi + imm8] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ shl = 64 - bit_offset - bits; ++ shr = bit_offset + shl; ++ ++ if (shl) { ++ /* shl rax, imm8 */ ++ I4(0x48, 0xc1, 0xe0, shl); ++ } ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } else { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 96); ++ ++ /* mov rax, [rsi + byte] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ /* mov edx, [rsi + byte + 8] */ ++ I3(0x8b, 0x56, byte + 8); ++ ++ /* bits from next word: */ ++ shr = bit_offset + bits - 64; ++ BUG_ON(shr > bit_offset); ++ ++ /* shr rax, bit_offset */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ ++ /* shl rdx, imm8 */ ++ I4(0x48, 0xc1, 0xe2, 64 - shr); ++ ++ /* or rax, rdx */ ++ I3(0x48, 0x09, 0xd0); ++ ++ shr = bit_offset - shr; ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } ++ ++ /* rax += offset: */ ++ if (offset > S32_MAX) { ++ /* mov rdx, imm64 */ ++ I2(0x48, 0xba); ++ memcpy(out, &offset, 8); ++ out += 8; ++ /* add %rdx, %rax */ ++ I3(0x48, 0x01, 0xd0); ++ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { ++ /* add rax, imm32 */ ++ I2(0x48, 0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } else if (offset) { ++ /* add eax, imm32 */ ++ I1(0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++set_field: ++ switch (dst_size) { ++ case 8: ++ /* mov [rdi + dst_offset], rax */ ++ I4(0x48, 0x89, 0x47, dst_offset); ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], eax */ ++ I3(0x89, 0x47, dst_offset); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++} ++ ++int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) ++{ ++ bool eax_zeroed = false; ++ u8 *out = _out; ++ ++ /* ++ * rdi: dst - unpacked key ++ * rsi: src - packed key ++ */ ++ ++ /* k->u64s, k->format, k->type */ ++ ++ /* mov eax, [rsi] */ ++ I2(0x8b, 0x06); ++ ++ /* add eax, BKEY_U64s - format->key_u64s */ ++ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); ++ ++ /* and eax, imm32: mask out k->pad: */ ++ I5(0x25, 0xff, 0xff, 0xff, 0); ++ ++ /* mov [rdi], eax */ ++ I2(0x89, 0x07); ++ ++#define x(id, field) \ ++ out = compile_bkey_field(format, out, id, \ ++ offsetof(struct bkey, field), \ ++ sizeof(((struct bkey *) NULL)->field), \ ++ &eax_zeroed); ++ bkey_fields() ++#undef x ++ ++ /* retq */ ++ I1(0xc3); ++ ++ return (void *) out - _out; ++} ++ ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ 
b->nr_key_bits); ++ ++ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++} ++ ++__pure __flatten ++int bch2_bkey_cmp_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void*) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void*) &unpacked; ++ } ++ ++ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ const struct bkey *l_unpacked; ++ ++ return unlikely(l_unpacked = packed_to_bkey_c(l)) ++ ? bpos_cmp(l_unpacked->p, *r) ++ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++void bch2_bpos_swab(struct bpos *p) ++{ ++ u8 *l = (u8 *) p; ++ u8 *h = ((u8 *) &p[1]) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ++{ ++ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; ++ u8 *l = k->key_start; ++ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void) ++{ ++ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); ++ struct bkey_packed p; ++ ++ struct bkey_format test_format = { ++ .key_u64s = 3, ++ .nr_fields = BKEY_NR_FIELDS, ++ .bits_per_field = { ++ 13, ++ 64, ++ 32, ++ }, ++ }; ++ ++ struct unpack_state in_s = ++ unpack_state_init(&bch2_bkey_format_current, (void *) &t); ++ struct pack_state out_s = pack_state_init(&test_format, &p); ++ unsigned i; ++ ++ for (i = 0; i < out_s.format->nr_fields; i++) { ++ u64 a, v = get_inc_field(&in_s, i); ++ ++ switch (i) { ++#define x(id, field) case id: a = t.field; break; ++ bkey_fields() ++#undef x ++ default: ++ BUG(); ++ } ++ ++ if (a != v) ++ panic("got %llu actual %llu i %u\n", v, a, i); ++ ++ if (!set_inc_field(&out_s, i, v)) ++ panic("failed at %u\n", i); ++ } ++ ++ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); ++} ++#endif +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +new file mode 100644 +index 000000000000..7dee3d8e0a3d +--- /dev/null ++++ b/fs/bcachefs/bkey.h +@@ -0,0 +1,566 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_H ++#define _BCACHEFS_BKEY_H ++ ++#include ++#include "bcachefs_format.h" ++ ++#include "util.h" ++#include "vstructs.h" ++ ++#ifdef CONFIG_X86_64 ++#define HAVE_BCACHEFS_COMPILED_UNPACK 1 ++#endif ++ ++void bch2_to_binary(char *, const u64 *, unsigned); ++ ++/* bkey with split value, const */ ++struct bkey_s_c { ++ const struct bkey *k; ++ const struct bch_val *v; ++}; ++ ++/* bkey with split value */ ++struct bkey_s { ++ union { ++ struct { ++ struct bkey *k; ++ struct bch_val *v; ++ }; ++ struct bkey_s_c s_c; ++ }; ++}; ++ ++#define bkey_next(_k) vstruct_next(_k) ++ ++#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) ++ ++static inline size_t bkey_val_bytes(const struct bkey *k) ++{ ++ 
return bkey_val_u64s(k) * sizeof(u64); ++} ++ ++static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) ++{ ++ k->u64s = BKEY_U64s + val_u64s; ++} ++ ++static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) ++{ ++ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) ++ ++#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) ++ ++#define bkey_whiteout(_k) \ ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) ++ ++enum bkey_lr_packed { ++ BKEY_PACKED_BOTH, ++ BKEY_PACKED_RIGHT, ++ BKEY_PACKED_LEFT, ++ BKEY_PACKED_NONE, ++}; ++ ++#define bkey_lr_packed(_l, _r) \ ++ ((_l)->format + ((_r)->format << 1)) ++ ++#define bkey_copy(_dst, _src) \ ++do { \ ++ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ ++ !type_is(_dst, struct bkey_packed *)); \ ++ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ ++ !type_is(_src, struct bkey_packed *)); \ ++ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ ++ (u64 *) (_dst) < (u64 *) (_src) + \ ++ ((struct bkey *) (_src))->u64s); \ ++ \ ++ memcpy_u64s_small((_dst), (_src), \ ++ ((struct bkey *) (_src))->u64s); \ ++} while (0) ++ ++struct btree; ++ ++struct bkey_format_state { ++ u64 field_min[BKEY_NR_FIELDS]; ++ u64 field_max[BKEY_NR_FIELDS]; ++}; ++ ++void bch2_bkey_format_init(struct bkey_format_state *); ++void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); ++void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); ++const char *bch2_bkey_format_validate(struct bkey_format *); ++ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++__pure ++unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++__pure ++int bch2_bkey_cmp_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++static inline __pure ++int bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, const struct bpos *r) ++{ ++ return __bch2_bkey_cmp_left_packed(b, l, r); ++} ++ ++/* ++ * we prefer to pass bpos by ref, but it's often enough terribly convenient to ++ * pass it by by val... as much as I hate c++, const ref would be nice here: ++ */ ++__pure __flatten ++static inline int bkey_cmp_left_packed_byval(const struct btree *b, ++ const struct bkey_packed *l, ++ struct bpos r) ++{ ++ return bkey_cmp_left_packed(b, l, &r); ++} ++ ++static __always_inline int bpos_cmp(struct bpos l, struct bpos r) ++{ ++ return cmp_int(l.inode, r.inode) ?: ++ cmp_int(l.offset, r.offset) ?: ++ cmp_int(l.snapshot, r.snapshot); ++} ++ ++static __always_inline int bkey_cmp(struct bpos l, struct bpos r) ++{ ++ return cmp_int(l.inode, r.inode) ?: ++ cmp_int(l.offset, r.offset); ++} ++ ++static inline struct bpos bpos_min(struct bpos l, struct bpos r) ++{ ++ return bpos_cmp(l, r) < 0 ? l : r; ++} ++ ++static inline struct bpos bpos_max(struct bpos l, struct bpos r) ++{ ++ return bpos_cmp(l, r) > 0 ? 
l : r; ++} ++ ++void bch2_bpos_swab(struct bpos *); ++void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); ++ ++static __always_inline int bversion_cmp(struct bversion l, struct bversion r) ++{ ++ return cmp_int(l.hi, r.hi) ?: ++ cmp_int(l.lo, r.lo); ++} ++ ++#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) ++#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) ++ ++static __always_inline int bversion_zero(struct bversion v) ++{ ++ return !bversion_cmp(v, ZERO_VERSION); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++/* statement expressions confusing unlikely()? */ ++#define bkey_packed(_k) \ ++ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ ++ (_k)->format != KEY_FORMAT_CURRENT; }) ++#else ++#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) ++#endif ++ ++/* ++ * It's safe to treat an unpacked bkey as a packed one, but not the reverse ++ */ ++static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) ++{ ++ return (struct bkey_packed *) k; ++} ++ ++static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) ++{ ++ return (const struct bkey_packed *) k; ++} ++ ++static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (struct bkey_i *) k; ++} ++ ++static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (const struct bkey *) k; ++} ++ ++static inline unsigned bkey_format_key_bits(const struct bkey_format *format) ++{ ++ return format->bits_per_field[BKEY_FIELD_INODE] + ++ format->bits_per_field[BKEY_FIELD_OFFSET] + ++ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; ++} ++ ++static inline struct bpos bpos_successor(struct bpos p) ++{ ++ if (!++p.snapshot && ++ !++p.offset && ++ !++p.inode) ++ BUG(); ++ ++ return p; ++} ++ ++static inline struct bpos bpos_predecessor(struct bpos p) ++{ ++ if (!p.snapshot-- && ++ !p.offset-- && ++ !p.inode--) ++ BUG(); ++ ++ return p; ++} ++ ++static inline struct bpos bpos_nosnap_successor(struct bpos p) ++{ ++ p.snapshot = 0; ++ ++ if (!++p.offset && ++ !++p.inode) ++ BUG(); ++ ++ return p; ++} ++ ++static inline struct bpos bpos_nosnap_predecessor(struct bpos p) ++{ ++ p.snapshot = 0; ++ ++ if (!p.offset-- && ++ !p.inode--) ++ BUG(); ++ ++ return p; ++} ++ ++static inline u64 bkey_start_offset(const struct bkey *k) ++{ ++ return k->p.offset - k->size; ++} ++ ++static inline struct bpos bkey_start_pos(const struct bkey *k) ++{ ++ return (struct bpos) { ++ .inode = k->p.inode, ++ .offset = bkey_start_offset(k), ++ .snapshot = k->p.snapshot, ++ }; ++} ++ ++/* Packed helpers */ ++ ++static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; ++ ++ EBUG_ON(k->u64s < ret); ++ return ret; ++} ++ ++static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_key_u64s(format, k) * sizeof(u64); ++} ++ ++static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return k->u64s - bkeyp_key_u64s(format, k); ++} ++ ++static inline size_t bkeyp_val_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_val_u64s(format, k) * sizeof(u64); ++} ++ ++static inline void set_bkeyp_val_u64s(const struct bkey_format *format, ++ struct bkey_packed *k, unsigned val_u64s) ++{ ++ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; ++} ++ ++#define bkeyp_val(_format, _k) \ ++ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) ++ ++extern const struct bkey_format bch2_bkey_format_current; ++ ++bool bch2_bkey_transform(const struct bkey_format *, ++ struct bkey_packed *, ++ const struct bkey_format *, ++ const struct bkey_packed *); ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *, ++ const struct bkey_packed *); ++#endif ++ ++bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, ++ const struct bkey_format *); ++ ++enum bkey_pack_pos_ret { ++ BKEY_PACK_POS_EXACT, ++ BKEY_PACK_POS_SMALLER, ++ BKEY_PACK_POS_FAIL, ++}; ++ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, ++ const struct btree *); ++ ++static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, ++ const struct btree *b) ++{ ++ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; ++} ++ ++void bch2_bkey_unpack(const struct btree *, struct bkey_i *, ++ const struct bkey_packed *); ++bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, ++ const struct bkey_format *); ++ ++static inline u64 bkey_field_max(const struct bkey_format *f, ++ enum bch_bkey_fields nr) ++{ ++ return f->bits_per_field[nr] < 64 ++ ? (le64_to_cpu(f->field_offset[nr]) + ++ ~(~0ULL << f->bits_per_field[nr])) ++ : U64_MAX; ++} ++ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ ++int bch2_compile_bkey_format(const struct bkey_format *, void *); ++ ++#else ++ ++static inline int bch2_compile_bkey_format(const struct bkey_format *format, ++ void *out) { return 0; } ++ ++#endif ++ ++static inline void bkey_reassemble(struct bkey_i *dst, ++ struct bkey_s_c src) ++{ ++ dst->k = *src.k; ++ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); ++} ++ ++#define bkey_s_null ((struct bkey_s) { .k = NULL }) ++#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) ++ ++#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) ++#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) ++ ++static inline struct bkey_s bkey_to_s(struct bkey *k) ++{ ++ return (struct bkey_s) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) ++{ ++ return (struct bkey_s_c) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) ++{ ++ return (struct bkey_s) { .k = &k->k, .v = &k->v }; ++} ++ ++static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) ++{ ++ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; ++} ++ ++/* ++ * For a given type of value (e.g. 
struct bch_extent), generates the types for ++ * bkey + bch_extent - inline, split, split const - and also all the conversion ++ * functions, which also check that the value is of the correct type. ++ * ++ * We use anonymous unions for upcasting - e.g. converting from e.g. a ++ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion ++ * functions. ++ */ ++#define x(name, ...) \ ++struct bkey_i_##name { \ ++ union { \ ++ struct bkey k; \ ++ struct bkey_i k_i; \ ++ }; \ ++ struct bch_##name v; \ ++}; \ ++ \ ++struct bkey_s_c_##name { \ ++ union { \ ++ struct { \ ++ const struct bkey *k; \ ++ const struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++struct bkey_s_##name { \ ++ union { \ ++ struct { \ ++ struct bkey *k; \ ++ struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c_##name c; \ ++ struct bkey_s s; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline const struct bkey_i_##name * \ ++bkey_i_to_##name##_c(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ ++{ \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++name##_i_to_s_c(const struct bkey_i_##name *k) \ ++{ \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++bkey_i_to_s_c_##name(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ ++{ \ ++ struct bkey_i_##name *k = \ ++ container_of(&_k->k, struct bkey_i_##name, k); \ ++ \ ++ bkey_init(&k->k); \ ++ memset(&k->v, 0, sizeof(k->v)); \ ++ k->k.type = KEY_TYPE_##name; \ ++ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ ++ \ ++ return k; \ ++} ++ ++BCH_BKEY_TYPES(); ++#undef x ++ ++/* byte order helpers */ ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return f->key_u64s - 1; ++} ++ ++#define high_bit_offset 0 ++#define nth_word(p, n) ((p) - (n)) ++ ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return 0; ++} ++ ++#define high_bit_offset KEY_PACKED_BITS_START ++#define nth_word(p, n) ((p) 
+ (n)) ++ ++#else ++#error edit for your odd byteorder. ++#endif ++ ++#define high_word(f, k) ((k)->_data + high_word_offset(f)) ++#define next_word(p) nth_word(p, 1) ++#define prev_word(p) nth_word(p, -1) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void); ++#else ++static inline void bch2_bkey_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_BKEY_H */ +diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h +new file mode 100644 +index 000000000000..0d7c67a959af +--- /dev/null ++++ b/fs/bcachefs/bkey_buf.h +@@ -0,0 +1,60 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_BUF_H ++#define _BCACHEFS_BKEY_BUF_H ++ ++#include "bcachefs.h" ++ ++struct bkey_buf { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_buf_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ ++static inline void bch2_bkey_buf_copy(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct bkey_i *src) ++{ ++ bch2_bkey_buf_realloc(s, c, src->k.u64s); ++ bkey_copy(s->k, src); ++} ++ ++static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct btree *b, ++ struct bkey_packed *src) ++{ ++ bch2_bkey_buf_realloc(s, c, BKEY_U64s + ++ bkeyp_val_u64s(&b->format, src)); ++ bch2_bkey_unpack(b, s->k, src); ++} ++ ++static inline void bch2_bkey_buf_init(struct bkey_buf *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_BUF_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +new file mode 100644 +index 000000000000..e0cbac8811af +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.c +@@ -0,0 +1,503 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "backpointers.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "alloc_background.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "lru.h" ++#include "quota.h" ++#include "reflink.h" ++#include "subvolume.h" ++#include "xattr.h" ++ ++const char * const bch2_bkey_types[] = { ++#define x(name, nr) #name, ++ BCH_BKEY_TYPES() ++#undef x ++ NULL ++}; ++ ++static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ return 0; ++} ++ ++#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_bytes(k.k)) { ++ prt_printf(err, "incorrect value size (%zu != 0)", ++ bkey_val_bytes(k.k)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++#define bch2_bkey_ops_error (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if 
(bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++ .key_invalid = key_type_cookie_invalid, \ ++} ++ ++#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ return 0; ++} ++ ++static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ prt_printf(out, "datalen %u: %*phN", ++ datalen, min(datalen, 32U), d.v->data); ++} ++ ++#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++ .key_invalid = key_type_inline_data_invalid, \ ++ .val_to_text = key_type_inline_data_to_text, \ ++} ++ ++static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_bytes(k.k)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++#define bch2_bkey_ops_set (struct bkey_ops) { \ ++ .key_invalid = key_type_set_invalid, \ ++ .key_merge = key_type_set_merge, \ ++} ++ ++const struct bkey_ops bch2_bkey_ops[] = { ++#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, ++ BCH_BKEY_TYPES() ++#undef x ++}; ++ ++int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->type >= KEY_TYPE_MAX) { ++ prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); ++} ++ ++static unsigned bch2_key_types_allowed[] = { ++ [BKEY_TYPE_extents] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_error)| ++ (1U << KEY_TYPE_cookie)| ++ (1U << KEY_TYPE_extent)| ++ (1U << KEY_TYPE_reservation)| ++ (1U << KEY_TYPE_reflink_p)| ++ (1U << KEY_TYPE_inline_data), ++ [BKEY_TYPE_inodes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_inode)| ++ (1U << KEY_TYPE_inode_v2)| ++ (1U << KEY_TYPE_inode_generation), ++ [BKEY_TYPE_dirents] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_hash_whiteout)| ++ (1U << KEY_TYPE_dirent), ++ [BKEY_TYPE_xattrs] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_cookie)| ++ (1U << KEY_TYPE_hash_whiteout)| ++ (1U << KEY_TYPE_xattr), ++ [BKEY_TYPE_alloc] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_alloc)| ++ (1U << KEY_TYPE_alloc_v2)| ++ (1U << KEY_TYPE_alloc_v3)| ++ (1U << KEY_TYPE_alloc_v4), ++ [BKEY_TYPE_quotas] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_quota), ++ [BKEY_TYPE_stripes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_stripe), ++ [BKEY_TYPE_reflink] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_reflink_v)| ++ (1U << KEY_TYPE_indirect_inline_data), ++ [BKEY_TYPE_subvolumes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_subvolume), ++ [BKEY_TYPE_snapshots] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_snapshot), ++ 
[BKEY_TYPE_lru] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_lru), ++ [BKEY_TYPE_freespace] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_need_discard] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_backpointers] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_backpointer), ++ [BKEY_TYPE_btree] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_btree_ptr)| ++ (1U << KEY_TYPE_btree_ptr_v2), ++}; ++ ++int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->u64s < BKEY_U64s) { ++ prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); ++ return -EINVAL; ++ } ++ ++ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { ++ prt_printf(err, "invalid key type for btree %s (%s)", ++ bch2_btree_ids[type], bch2_bkey_types[type]); ++ return -EINVAL; ++ } ++ ++ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { ++ if (k.k->size == 0) { ++ prt_printf(err, "size == 0"); ++ return -EINVAL; ++ } ++ ++ if (k.k->size > k.k->p.offset) { ++ prt_printf(err, "size greater than offset (%u > %llu)", ++ k.k->size, k.k->p.offset); ++ return -EINVAL; ++ } ++ } else { ++ if (k.k->size) { ++ prt_printf(err, "size != 0"); ++ return -EINVAL; ++ } ++ } ++ ++ if (type != BKEY_TYPE_btree && ++ !btree_type_has_snapshots(type) && ++ k.k->p.snapshot) { ++ prt_printf(err, "nonzero snapshot"); ++ return -EINVAL; ++ } ++ ++ if (type != BKEY_TYPE_btree && ++ btree_type_has_snapshots(type) && ++ !k.k->p.snapshot) { ++ prt_printf(err, "snapshot == 0"); ++ return -EINVAL; ++ } ++ ++ if (type != BKEY_TYPE_btree && ++ !bkey_cmp(k.k->p, POS_MAX)) { ++ prt_printf(err, "key at POS_MAX"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type, ++ int rw, struct printbuf *err) ++{ ++ return __bch2_bkey_invalid(c, k, type, rw, err) ?: ++ bch2_bkey_val_invalid(c, k, rw, err); ++} ++ ++int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, ++ struct printbuf *err) ++{ ++ if (bpos_cmp(k.k->p, b->data->min_key) < 0) { ++ prt_printf(err, "key before start of btree node"); ++ return -EINVAL; ++ } ++ ++ if (bpos_cmp(k.k->p, b->data->max_key) > 0) { ++ prt_printf(err, "key past end of btree node"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) ++{ ++ if (!bpos_cmp(pos, POS_MIN)) ++ prt_printf(out, "POS_MIN"); ++ else if (!bpos_cmp(pos, POS_MAX)) ++ prt_printf(out, "POS_MAX"); ++ else if (!bpos_cmp(pos, SPOS_MAX)) ++ prt_printf(out, "SPOS_MAX"); ++ else { ++ if (pos.inode == U64_MAX) ++ prt_printf(out, "U64_MAX"); ++ else ++ prt_printf(out, "%llu", pos.inode); ++ prt_printf(out, ":"); ++ if (pos.offset == U64_MAX) ++ prt_printf(out, "U64_MAX"); ++ else ++ prt_printf(out, "%llu", pos.offset); ++ prt_printf(out, ":"); ++ if (pos.snapshot == U32_MAX) ++ prt_printf(out, "U32_MAX"); ++ else ++ prt_printf(out, "%u", pos.snapshot); ++ } ++} ++ ++void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) ++{ ++ if (k) { ++ prt_printf(out, "u64s %u type ", k->u64s); ++ ++ if (k->type < KEY_TYPE_MAX) ++ prt_printf(out, "%s ", bch2_bkey_types[k->type]); ++ else ++ prt_printf(out, "%u ", k->type); ++ ++ bch2_bpos_to_text(out, k->p); ++ ++ prt_printf(out, " len %u ver %llu", k->size, k->version.lo); ++ } else { ++ prt_printf(out, "(null)"); ++ } ++} ++ ++void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ 
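/*
 * A minimal usage sketch (not taken from the patch itself) of how these
 * *_to_text helpers are typically consumed -- the same printbuf pattern
 * that bch2_dump_bset() uses further down in this patch; `c` and `k` are
 * assumed to already be in scope:
 *
 *	struct printbuf buf = PRINTBUF;
 *
 *	bch2_bkey_val_to_text(&buf, c, k);
 *	printk(KERN_ERR "bad key: %s\n", buf.buf);
 *	printbuf_exit(&buf);
 */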
++ if (k.k->type < KEY_TYPE_MAX) { ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++ } else { ++ prt_printf(out, "(invalid type %u)", k.k->type); ++ } ++} ++ ++void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_to_text(out, k.k); ++ ++ if (bkey_val_bytes(k.k)) { ++ prt_printf(out, ": "); ++ bch2_val_to_text(out, c, k); ++ } ++} ++ ++void bch2_bkey_swab_val(struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (ops->swab) ++ ops->swab(k); ++} ++ ++bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ return ops->key_normalize ++ ? ops->key_normalize(c, k) ++ : false; ++} ++ ++bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ ++ return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); ++} ++ ++static const struct old_bkey_type { ++ u8 btree_node_type; ++ u8 old; ++ u8 new; ++} bkey_renumber_table[] = { ++ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, ++ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, ++ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, ++}; ++ ++void bch2_bkey_renumber(enum btree_node_type btree_node_type, ++ struct bkey_packed *k, ++ int write) ++{ ++ const struct old_bkey_type *i; ++ ++ for (i = bkey_renumber_table; ++ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); ++ i++) ++ if (btree_node_type == i->btree_node_type && ++ k->type == (write ? i->new : i->old)) { ++ k->type = write ? i->old : i->new; ++ break; ++ } ++} ++ ++void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops; ++ struct bkey uk; ++ struct bkey_s u; ++ unsigned nr_compat = 5; ++ int i; ++ ++ /* ++ * Do these operations in reverse order in the write path: ++ */ ++ ++ for (i = 0; i < nr_compat; i++) ++ switch (!write ? 
i : nr_compat - 1 - i) { ++ case 0: ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ break; ++ case 1: ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ break; ++ case 2: ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_inodes) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ break; ++ case 3: ++ if (version < bcachefs_metadata_version_snapshot && ++ (level || btree_type_has_snapshots(btree_id))) { ++ struct bkey_i *u = packed_to_bkey(k); ++ ++ if (u) { ++ u->k.p.snapshot = write ++ ? 0 : U32_MAX; ++ } else { ++ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; ++ u64 max_packed = min_packed + ++ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); ++ ++ uk = __bch2_bkey_unpack_key(f, k); ++ uk.p.snapshot = write ++ ? min_packed : min_t(u64, U32_MAX, max_packed); ++ ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); ++ } ++ } ++ ++ break; ++ case 4: ++ if (!bkey_packed(k)) { ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); ++ } ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ ops = &bch2_bkey_ops[k->type]; ++ ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +new file mode 100644 +index 000000000000..db894b40d2ca +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.h +@@ -0,0 +1,175 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_METHODS_H ++#define _BCACHEFS_BKEY_METHODS_H ++ ++#include "bkey.h" ++ ++struct bch_fs; ++struct btree; ++struct btree_trans; ++struct bkey; ++enum btree_node_type; ++ ++extern const char * const bch2_bkey_types[]; ++ ++/* ++ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If ++ * invalid, entire key will be deleted. ++ * ++ * When invalid, error string is returned via @err. @rw indicates whether key is ++ * being read or written; more aggressive checks can be enabled when rw == WRITE. 
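/*
 * An illustrative sketch of a hook honouring this contract, modelled on the
 * empty-value checks in bkey_methods.c earlier in this patch (the "example_"
 * name is hypothetical, not part of bcachefs):
 *
 *	static int example_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 *				       int rw, struct printbuf *err)
 *	{
 *		if (bkey_val_bytes(k.k)) {
 *			prt_printf(err, "incorrect value size (%zu != 0)",
 *				   bkey_val_bytes(k.k));
 *			return -EINVAL;
 *		}
 *
 *		if (rw == WRITE) {
 *			// stricter, write-only checks can go here
 *		}
 *
 *		return 0;
 *	}
 */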
++*/ ++struct bkey_ops { ++ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err); ++ void (*val_to_text)(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ void (*swab)(struct bkey_s); ++ bool (*key_normalize)(struct bch_fs *, struct bkey_s); ++ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, unsigned); ++ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); ++ void (*compat)(enum btree_id id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s); ++}; ++ ++extern const struct bkey_ops bch2_bkey_ops[]; ++ ++int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type, int, struct printbuf *); ++int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type, int, struct printbuf *); ++int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); ++ ++void bch2_bpos_to_text(struct printbuf *, struct bpos); ++void bch2_bkey_to_text(struct printbuf *, const struct bkey *); ++void bch2_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_bkey_swab_val(struct bkey_s); ++ ++bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); ++ ++static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) ++{ ++ return l->type == r->type && ++ !bversion_cmp(l->version, r->version) && ++ !bpos_cmp(l->p, bkey_start_pos(r)) && ++ (u64) l->size + r->size <= KEY_SIZE_MAX && ++ bch2_bkey_ops[l->type].key_merge && ++ !bch2_key_merging_disabled; ++} ++ ++bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++static inline int bch2_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; ++ ++ return ops->atomic_trigger ++ ? 
ops->atomic_trigger(trans, old, new, flags) ++ : 0; ++} ++ ++enum btree_update_flags { ++ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ __BTREE_UPDATE_KEY_CACHE_RECLAIM, ++ ++ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ ++ ++ __BTREE_TRIGGER_INSERT, ++ __BTREE_TRIGGER_OVERWRITE, ++ ++ __BTREE_TRIGGER_GC, ++ __BTREE_TRIGGER_BUCKET_INVALIDATE, ++ __BTREE_TRIGGER_NOATOMIC, ++}; ++ ++#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ++#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) ++ ++#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) ++ ++#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) ++#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) ++ ++#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) ++#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) ++#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) ++ ++#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ ++ ((1U << KEY_TYPE_alloc)| \ ++ (1U << KEY_TYPE_alloc_v2)| \ ++ (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_alloc_v4)| \ ++ (1U << KEY_TYPE_stripe)| \ ++ (1U << KEY_TYPE_inode)| \ ++ (1U << KEY_TYPE_inode_v2)| \ ++ (1U << KEY_TYPE_snapshot)) ++ ++static inline int bch2_trans_mark_key(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; ++ ++ return ops->trans_trigger ++ ? ops->trans_trigger(trans, btree_id, level, old, new, flags) ++ : 0; ++} ++ ++static inline int bch2_trans_mark_old(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = old.k->p; ++ ++ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++} ++ ++static inline int bch2_trans_mark_new(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_i *new, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = new->k.p; ++ ++ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, ++ BTREE_TRIGGER_INSERT|flags); ++} ++ ++void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); ++ ++void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, ++ int, struct bkey_format *, struct bkey_packed *); ++ ++static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ if (version < bcachefs_metadata_version_current || ++ big_endian != CPU_BIG_ENDIAN) ++ __bch2_bkey_compat(level, btree_id, version, ++ big_endian, write, f, k); ++ ++} ++ ++#endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +new file mode 100644 +index 000000000000..b1385a77da11 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.c +@@ -0,0 +1,198 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "bkey_sort.h" ++#include "bset.h" ++#include "extents.h" ++ ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); ++ ++static inline bool sort_iter_end(struct sort_iter *iter) ++{ ++ return !iter->used; ++} ++ ++static inline void sort_iter_sift(struct sort_iter *iter, unsigned 
from, ++ sort_cmp_fn cmp) ++{ ++ unsigned i; ++ ++ for (i = from; ++ i + 1 < iter->used && ++ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; ++ i++) ++ swap(iter->data[i], iter->data[i + 1]); ++} ++ ++static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ unsigned i = iter->used; ++ ++ while (i--) ++ sort_iter_sift(iter, i, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) ++{ ++ return !sort_iter_end(iter) ? iter->data->k : NULL; ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ struct sort_iter_set *i = iter->data; ++ ++ BUG_ON(!iter->used); ++ ++ i->k = bkey_next(i->k); ++ ++ BUG_ON(i->k > i->end); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, 0); ++ else ++ sort_iter_sift(iter, 0, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, ++ sort_cmp_fn cmp) ++{ ++ struct bkey_packed *ret = sort_iter_peek(iter); ++ ++ if (ret) ++ sort_iter_advance(iter, cmp); ++ ++ return ret; ++} ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ */ ++static inline int key_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ?: ++ cmp_int((unsigned long) l, (unsigned long) r); ++} ++ ++static inline bool should_drop_next_key(struct sort_iter *iter) ++{ ++ /* ++ * key_sort_cmp() ensures that when keys compare equal the older key ++ * comes first; so if l->k compares equal to r->k then l->k is older ++ * and should be dropped. ++ */ ++ return iter->used >= 2 && ++ !bch2_bkey_cmp_packed(iter->b, ++ iter->data[0].k, ++ iter->data[1].k); ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct bkey_packed *out = dst->start; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); ++ ++ while ((k = sort_iter_peek(iter))) { ++ if (!bkey_deleted(k) && ++ !should_drop_next_key(iter)) { ++ bkey_copy(out, k); ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* Sort + repack in a new format: */ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *dst, struct btree *src, ++ struct btree_node_iter *src_iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_format *in_f = &src->format; ++ struct bkey_packed *in, *out = vstruct_last(dst); ++ struct btree_nr_keys nr; ++ bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { ++ if (filter_whiteouts && bkey_deleted(in)) ++ continue; ++ ++ if (!transform) ++ bkey_copy(out, in); ++ else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ else ++ bch2_bkey_unpack(src, (void *) out, in); ++ ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++static inline int sort_keys_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: ++ (int) l->needs_whiteout - (int) r->needs_whiteout; ++} ++ ++unsigned bch2_sort_keys(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *next, *out = dst; ++ ++ sort_iter_sort(iter, sort_keys_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ bool needs_whiteout = false; ++ ++ if (bkey_deleted(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ while ((next = sort_iter_peek(iter)) && ++ !bch2_bkey_cmp_packed(iter->b, in, next)) { ++ BUG_ON(in->needs_whiteout && ++ next->needs_whiteout); ++ needs_whiteout |= in->needs_whiteout; ++ in = sort_iter_next(iter, sort_keys_cmp); ++ } ++ ++ if (bkey_deleted(in)) { ++ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); ++ set_bkeyp_val_u64s(f, out, 0); ++ } else { ++ bkey_copy(out, in); ++ } ++ out->needs_whiteout |= needs_whiteout; ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +new file mode 100644 +index 000000000000..79cf11d1b4e7 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_SORT_H ++#define _BCACHEFS_BKEY_SORT_H ++ ++struct sort_iter { ++ struct btree *b; ++ unsigned used; ++ unsigned size; ++ ++ struct sort_iter_set { ++ struct bkey_packed *k, *end; ++ } data[MAX_BSETS + 1]; ++}; ++ ++static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) ++{ ++ iter->b = b; ++ iter->used = 0; ++ iter->size = ARRAY_SIZE(iter->data); ++} ++ ++static inline void sort_iter_add(struct sort_iter *iter, ++ struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ BUG_ON(iter->used >= iter->size); ++ ++ if (k != end) ++ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++ ++unsigned bch2_sort_keys(struct bkey_packed *, ++ struct sort_iter *, bool); ++ ++#endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +new file mode 100644 +index 000000000000..fa60ef84e4ef +--- /dev/null ++++ b/fs/bcachefs/bset.c +@@ -0,0 +1,1598 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for working with individual keys, and sorted sets of keys with in a ++ * btree node ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "bset.h" ++#include "eytzinger.h" ++#include "util.h" ++ ++#include ++#include ++#include ++#include ++ ++/* hack.. 
*/ ++#include "alloc_types.h" ++#include ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, ++ struct btree *); ++ ++static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) ++{ ++ unsigned n = ARRAY_SIZE(iter->data); ++ ++ while (n && __btree_node_iter_set_end(iter, n - 1)) ++ --n; ++ ++ return n; ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (offset <= t->end_offset) { ++ EBUG_ON(offset < btree_bkey_first_offset(t)); ++ return t; ++ } ++ ++ BUG(); ++} ++ ++/* ++ * There are never duplicate live keys in the btree - but including keys that ++ * have been flagged as deleted (and will be cleaned up later) we _will_ see ++ * duplicates. ++ * ++ * Thus the sort order is: usual key comparison first, but for keys that compare ++ * equal the deleted key(s) come first, and the (at most one) live version comes ++ * last. ++ * ++ * The main reason for this is insertion: to handle overwrites, we first iterate ++ * over keys that compare equal to our insert key, and then insert immediately ++ * prior to the first key greater than the key we're inserting - our insert ++ * position will be after all keys that compare equal to our insert key, which ++ * by the time we actually do the insert will all be deleted. ++ */ ++ ++void bch2_dump_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned set) ++{ ++ struct bkey_packed *_k, *_n; ++ struct bkey uk, n; ++ struct bkey_s_c k; ++ struct printbuf buf = PRINTBUF; ++ ++ if (!i->u64s) ++ return; ++ ++ for (_k = i->start; ++ _k < vstruct_last(i); ++ _k = _n) { ++ _n = bkey_next(_k); ++ ++ k = bkey_disassemble(b, _k, &uk); ++ ++ printbuf_reset(&buf); ++ if (c) ++ bch2_bkey_val_to_text(&buf, c, k); ++ else ++ bch2_bkey_to_text(&buf, k.k); ++ printk(KERN_ERR "block %u key %5zu: %s\n", set, ++ _k->_data - i->_data, buf.buf); ++ ++ if (_n == vstruct_last(i)) ++ continue; ++ ++ n = bkey_unpack_key(b, _n); ++ ++ if (bpos_cmp(n.p, k.k->p) < 0) { ++ printk(KERN_ERR "Key skipped backwards\n"); ++ continue; ++ } ++ ++ if (!bkey_deleted(k.k) && ++ !bpos_cmp(n.p, k.k->p)) ++ printk(KERN_ERR "Duplicate keys\n"); ++ } ++ ++ printbuf_exit(&buf); ++} ++ ++void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ console_lock(); ++ for_each_bset(b, t) ++ bch2_dump_bset(c, b, bset(b, t), t - b->set); ++ console_unlock(); ++} ++ ++void bch2_dump_btree_node_iter(struct btree *b, ++ struct btree_node_iter *iter) ++{ ++ struct btree_node_iter_set *set; ++ struct printbuf buf = PRINTBUF; ++ ++ printk(KERN_ERR "btree node iter with %u/%u sets:\n", ++ __btree_node_iter_used(iter), b->nsets); ++ ++ btree_node_iter_for_each(iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_to_text(&buf, &uk); ++ printk(KERN_ERR "set %zu key %u: %s\n", ++ t - b->set, set->k, buf.buf); ++ } ++ ++ printbuf_exit(&buf); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr = { 0 }; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_deleted(k)) ++ btree_keys_account_key_add(&nr, t - b->set, k); ++ ++ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); ++} ++ ++static void 
bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++ struct btree *b) ++{ ++ struct btree_node_iter iter = *_iter; ++ const struct bkey_packed *k, *n; ++ ++ k = bch2_btree_node_iter_peek_all(&iter, b); ++ __bch2_btree_node_iter_advance(&iter, b); ++ n = bch2_btree_node_iter_peek_all(&iter, b); ++ ++ bkey_unpack_key(b, k); ++ ++ if (n && ++ bkey_iter_cmp(b, k, n) > 0) { ++ struct btree_node_iter_set *set; ++ struct bkey ku = bkey_unpack_key(b, k); ++ struct bkey nu = bkey_unpack_key(b, n); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&buf1, &ku); ++ bch2_bkey_to_text(&buf2, &nu); ++ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", ++ buf1.buf, buf2.buf); ++ printk(KERN_ERR "iter was:"); ++ ++ btree_node_iter_for_each(_iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ printk(" [%zi %zi]", t - b->set, ++ k->_data - bset(b, t)->_data); ++ } ++ panic("\n"); ++ } ++} ++ ++void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *set, *s2; ++ struct bkey_packed *k, *p; ++ struct bset_tree *t; ++ ++ if (bch2_btree_node_iter_end(iter)) ++ return; ++ ++ /* Verify no duplicates: */ ++ btree_node_iter_for_each(iter, set) { ++ BUG_ON(set->k > set->end); ++ btree_node_iter_for_each(iter, s2) ++ BUG_ON(set != s2 && set->end == s2->end); ++ } ++ ++ /* Verify that set->end is correct: */ ++ btree_node_iter_for_each(iter, set) { ++ for_each_bset(b, t) ++ if (set->end == t->end_offset) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(set->k < btree_bkey_first_offset(t) || ++ set->k >= t->end_offset); ++ } ++ ++ /* Verify iterator is sorted: */ ++ btree_node_iter_for_each(iter, set) ++ BUG_ON(set != iter->data && ++ btree_node_iter_cmp(b, set[-1], set[0]) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(iter, b); ++ ++ for_each_bset(b, t) { ++ if (iter->data[0].end == t->end_offset) ++ continue; ++ ++ p = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ ++ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); ++ } ++} ++ ++void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); ++ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++#if 0 ++ BUG_ON(prev && ++ bkey_iter_cmp(b, prev, insert) > 0); ++#else ++ if (prev && ++ bkey_iter_cmp(b, prev, insert) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, prev); ++ struct bkey k2 = bkey_unpack_key(b, insert); ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); ++ ++ panic("prev > insert:\n" ++ "prev key %s\n" ++ "insert key %s\n", ++ buf1.buf, buf2.buf); ++ } ++#endif ++#if 0 ++ BUG_ON(next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0); ++#else ++ if (next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, insert); ++ struct bkey k2 = bkey_unpack_key(b, next); ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); ++ ++ panic("insert > next:\n" ++ "insert key %s\n" ++ "next key %s\n", ++ buf1.buf, buf2.buf); ++ } ++#endif ++} ++ ++#else ++ ++static inline void 
bch2_btree_node_iter_next_check(struct btree_node_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++/* Auxiliary search trees */ ++ ++#define BFLOAT_FAILED_UNPACKED U8_MAX ++#define BFLOAT_FAILED U8_MAX ++ ++struct bkey_float { ++ u8 exponent; ++ u8 key_offset; ++ u16 mantissa; ++}; ++#define BKEY_MANTISSA_BITS 16 ++ ++static unsigned bkey_float_byte_offset(unsigned idx) ++{ ++ return idx * sizeof(struct bkey_float); ++} ++ ++struct ro_aux_tree { ++ struct bkey_float f[0]; ++}; ++ ++struct rw_aux_tree { ++ u16 offset; ++ struct bpos k; ++}; ++ ++static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) ++{ ++ BUG_ON(t->aux_data_offset == U16_MAX); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return t->aux_data_offset; ++ case BSET_RO_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + ++ t->size * sizeof(u8), 8); ++ case BSET_RW_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned bset_aux_tree_buf_start(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return t == b->set ++ ? DIV_ROUND_UP(b->unpack_fn_len, 8) ++ : bset_aux_tree_buf_end(t - 1); ++} ++ ++static void *__aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return b->aux_data + t->aux_data_offset * 8; ++} ++ ++static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++static u8 *ro_aux_tree_prev(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); ++} ++ ++static struct bkey_float *bkey_float(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned idx) ++{ ++ return ro_aux_tree_base(b, t)->f + idx; ++} ++ ++static void bset_aux_tree_verify(const struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ const struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ } ++#endif ++} ++ ++void bch2_btree_keys_init(struct btree *b) ++{ ++ unsigned i; ++ ++ b->nsets = 0; ++ memset(&b->nr, 0, sizeof(b->nr)); ++ ++ for (i = 0; i < MAX_BSETS; i++) ++ b->set[i].data_offset = U16_MAX; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++/* Binary tree stuff for auxiliary search trees */ ++ ++/* ++ * Cacheline/offset <-> bkey pointer arithmetic: ++ * ++ * t->tree is a binary search tree in an array; each node corresponds to a key ++ * in one cacheline in t->set (BSET_CACHELINE bytes). ++ * ++ * This means we don't have to store the full index of the key that a node in ++ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and ++ * then bkey_float->m gives us the offset within that cacheline, in units of 8 ++ * bytes. ++ * ++ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to ++ * make this work. ++ * ++ * To construct the bfloat for an arbitrary key we need to know what the key ++ * immediately preceding it is: we have to check if the two keys differ in the ++ * bits we're going to store in bkey_float->mantissa. 
t->prev[j] stores the size ++ * of the previous key so we can walk backwards to it from t->tree[j]'s key. ++ */ ++ ++static inline void *bset_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline) ++{ ++ return (void *) round_down((unsigned long) btree_bkey_first(b, t), ++ L1_CACHE_BYTES) + ++ cacheline * BSET_CACHELINE; ++} ++ ++static struct bkey_packed *cacheline_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ unsigned offset) ++{ ++ return bset_cacheline(b, t, cacheline) + offset * 8; ++} ++ ++static unsigned bkey_to_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bkey_packed *k) ++{ ++ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; ++} ++ ++static ssize_t __bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); ++} ++ ++static unsigned bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); ++ ++ EBUG_ON(m > U8_MAX); ++ return m; ++} ++ ++static inline struct bkey_packed *tree_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ return cacheline_to_bkey(b, t, ++ __eytzinger1_to_inorder(j, t->size - 1, t->extra), ++ bkey_float(b, t, j)->key_offset); ++} ++ ++static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; ++ ++ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); ++} ++ ++static struct rw_aux_tree *rw_aux_tree(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++/* ++ * For the write set - the one we're currently inserting keys into - we don't ++ * maintain a full search tree, we just keep a simple lookup table in t->prev. 
++ */ ++static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, ++ struct bset_tree *t, ++ unsigned j) ++{ ++ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); ++} ++ ++static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, ++ unsigned j, struct bkey_packed *k) ++{ ++ EBUG_ON(k >= btree_bkey_last(b, t)); ++ ++ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { ++ .offset = __btree_node_key_to_offset(b, k), ++ .k = bkey_unpack_pos(b, k), ++ }; ++} ++ ++static void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ unsigned j = 0; ++ ++ if (!bch2_expensive_debug_checks) ++ return; ++ ++ BUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ BUG_ON(t->size < 1); ++ BUG_ON(rw_aux_to_bkey(b, t, j) != k); ++ ++ goto start; ++ while (1) { ++ if (rw_aux_to_bkey(b, t, j) == k) { ++ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, ++ bkey_unpack_pos(b, k))); ++start: ++ if (++j == t->size) ++ break; ++ ++ BUG_ON(rw_aux_tree(b, t)[j].offset <= ++ rw_aux_tree(b, t)[j - 1].offset); ++ } ++ ++ k = bkey_next(k); ++ BUG_ON(k >= btree_bkey_last(b, t)); ++ } ++} ++ ++/* returns idx of first entry >= offset: */ ++static unsigned rw_aux_tree_bsearch(struct btree *b, ++ struct bset_tree *t, ++ unsigned offset) ++{ ++ unsigned bset_offs = offset - btree_bkey_first_offset(t); ++ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); ++ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ EBUG_ON(!t->size); ++ EBUG_ON(idx > t->size); ++ ++ while (idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset) ++ idx++; ++ ++ while (idx && ++ rw_aux_tree(b, t)[idx - 1].offset >= offset) ++ idx--; ++ ++ EBUG_ON(idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset); ++ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); ++ EBUG_ON(idx + 1 < t->size && ++ rw_aux_tree(b, t)[idx].offset == ++ rw_aux_tree(b, t)[idx + 1].offset); ++ ++ return idx; ++} ++ ++static inline unsigned bkey_mantissa(const struct bkey_packed *k, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++ u64 v; ++ ++ EBUG_ON(!bkey_packed(k)); ++ ++ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); ++ ++ /* ++ * In little endian, we're shifting off low bits (and then the bits we ++ * want are at the low end), in big endian we're shifting off high bits ++ * (and then the bits we want are at the high end, so we shift them ++ * back down): ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ v >>= f->exponent & 7; ++#else ++ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; ++#endif ++ return (u16) v; ++} ++ ++__always_inline ++static inline void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_float *f = bkey_float(b, t, j); ++ struct bkey_packed *m = tree_to_bkey(b, t, j); ++ struct bkey_packed *l = is_power_of_2(j) ++ ? min_key ++ : tree_to_prev_bkey(b, t, j >> ffs(j)); ++ struct bkey_packed *r = is_power_of_2(j + 1) ++ ? max_key ++ : tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ unsigned mantissa; ++ int shift, exponent, high_bit; ++ ++ /* ++ * for failed bfloats, the lookup code falls back to comparing against ++ * the original key. 
++ */ ++ ++ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || ++ !b->nr_key_bits) { ++ f->exponent = BFLOAT_FAILED_UNPACKED; ++ return; ++ } ++ ++ /* ++ * The greatest differing bit of l and r is the first bit we must ++ * include in the bfloat mantissa we're creating in order to do ++ * comparisons - that bit always becomes the high bit of ++ * bfloat->mantissa, and thus the exponent we're calculating here is ++ * the position of what will become the low bit in bfloat->mantissa: ++ * ++ * Note that this may be negative - we may be running off the low end ++ * of the key: we handle this later: ++ */ ++ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), ++ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); ++ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); ++ ++ /* ++ * Then we calculate the actual shift value, from the start of the key ++ * (k->_data), to get the key bits starting at exponent: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; ++ ++ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); ++#else ++ shift = high_bit_offset + ++ b->nr_key_bits - ++ exponent - ++ BKEY_MANTISSA_BITS; ++ ++ EBUG_ON(shift < KEY_PACKED_BITS_START); ++#endif ++ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); ++ ++ f->exponent = shift; ++ mantissa = bkey_mantissa(m, f, j); ++ ++ /* ++ * If we've got garbage bits, set them to all 1s - it's legal for the ++ * bfloat to compare larger than the original key, but not smaller: ++ */ ++ if (exponent < 0) ++ mantissa |= ~(~0U << -exponent); ++ ++ f->mantissa = mantissa; ++} ++ ++/* bytes remaining - only valid for last bset: */ ++static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) ++{ ++ bset_aux_tree_verify(b); ++ ++ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); ++} ++ ++static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / ++ (sizeof(struct bkey_float) + sizeof(u8)); ++} ++ ++static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); ++} ++ ++static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k; ++ ++ t->size = 1; ++ t->extra = BSET_RW_AUX_TREE_VAL; ++ rw_aux_tree(b, t)[0].offset = ++ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); ++ ++ bset_tree_for_each_key(b, t, k) { ++ if (t->size == bset_rw_tree_capacity(b, t)) ++ break; ++ ++ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > ++ L1_CACHE_BYTES) ++ rw_aux_tree_set(b, t, t->size++, k); ++ } ++} ++ ++static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); ++ struct bkey_i min_key, max_key; ++ unsigned j, cacheline = 1; ++ ++ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), ++ bset_ro_tree_capacity(b, t)); ++retry: ++ if (t->size < 2) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ return; ++ } ++ ++ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; ++ ++ /* First we figure out where the first key in each cacheline is */ ++ eytzinger1_for_each(j, t->size - 1) { ++ while (bkey_to_cacheline(b, t, k) < cacheline) ++ prev = k, k = bkey_next(k); ++ ++ if (k >= btree_bkey_last(b, t)) { ++ /* XXX: this path sucks */ ++ t->size--; ++ goto retry; ++ } ++ ++ ro_aux_tree_prev(b, t)[j] 
= prev->u64s; ++ bkey_float(b, t, j)->key_offset = ++ bkey_to_cacheline_offset(b, t, cacheline++, k); ++ ++ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); ++ EBUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++ ++ while (k != btree_bkey_last(b, t)) ++ prev = k, k = bkey_next(k); ++ ++ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { ++ bkey_init(&min_key.k); ++ min_key.k.p = b->data->min_key; ++ } ++ ++ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { ++ bkey_init(&max_key.k); ++ max_key.k.p = b->data->max_key; ++ } ++ ++ /* Then we build the tree */ ++ eytzinger1_for_each(j, t->size - 1) ++ make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); ++} ++ ++static void bset_alloc_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bset_tree *i; ++ ++ for (i = b->set; i != t; i++) ++ BUG_ON(bset_has_rw_aux_tree(i)); ++ ++ bch2_bset_set_no_aux_tree(b, t); ++ ++ /* round up to next cacheline: */ ++ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), ++ SMP_CACHE_BYTES / sizeof(u64)); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, ++ bool writeable) ++{ ++ if (writeable ++ ? bset_has_rw_aux_tree(t) ++ : bset_has_ro_aux_tree(t)) ++ return; ++ ++ bset_alloc_tree(b, t); ++ ++ if (!__bset_tree_capacity(b, t)) ++ return; ++ ++ if (writeable) ++ __build_rw_aux_tree(b, t); ++ else ++ __build_ro_aux_tree(b, t); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_init_first(struct btree *b, struct bset *i) ++{ ++ struct bset_tree *t; ++ ++ BUG_ON(b->nsets); ++ ++ memset(i, 0, sizeof(*i)); ++ get_random_bytes(&i->seq, sizeof(i->seq)); ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++void bch2_bset_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_node_entry *bne) ++{ ++ struct bset *i = &bne->keys; ++ struct bset_tree *t; ++ ++ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); ++ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ memset(i, 0, sizeof(*i)); ++ i->seq = btree_bset_first(b)->seq; ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++/* ++ * find _some_ key in the same bset as @k that precedes @k - not necessarily the ++ * immediate predecessor: ++ */ ++static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed *p; ++ unsigned offset; ++ int j; ++ ++ EBUG_ON(k < btree_bkey_first(b, t) || ++ k > btree_bkey_last(b, t)); ++ ++ if (k == btree_bkey_first(b, t)) ++ return NULL; ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ p = btree_bkey_first(b, t); ++ break; ++ case BSET_RO_AUX_TREE: ++ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); ++ ++ do { ++ p = j ? tree_to_bkey(b, t, ++ __inorder_to_eytzinger1(j--, ++ t->size - 1, t->extra)) ++ : btree_bkey_first(b, t); ++ } while (p >= k); ++ break; ++ case BSET_RW_AUX_TREE: ++ offset = __btree_node_key_to_offset(b, k); ++ j = rw_aux_tree_bsearch(b, t, offset); ++ p = j ? 
rw_aux_to_bkey(b, t, j - 1) ++ : btree_bkey_first(b, t); ++ break; ++ } ++ ++ return p; ++} ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; ++ ++ while ((p = __bkey_prev(b, t, k)) && !ret) { ++ for (i = p; i != k; i = bkey_next(i)) ++ if (i->type >= min_key_type) ++ ret = i; ++ ++ k = p; ++ } ++ ++ if (bch2_expensive_debug_checks) { ++ BUG_ON(ret >= orig_k); ++ ++ for (i = ret ++ ? bkey_next(ret) ++ : btree_bkey_first(b, t); ++ i != orig_k; ++ i = bkey_next(i)) ++ BUG_ON(i->type >= min_key_type); ++ } ++ ++ return ret; ++} ++ ++/* Insert */ ++ ++static void bch2_bset_fix_lookup_table(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *_where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ int shift = new_u64s - clobber_u64s; ++ unsigned l, j, where = __btree_node_key_to_offset(b, _where); ++ ++ EBUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ /* returns first entry >= where */ ++ l = rw_aux_tree_bsearch(b, t, where); ++ ++ if (!l) /* never delete first entry */ ++ l++; ++ else if (l < t->size && ++ where < t->end_offset && ++ rw_aux_tree(b, t)[l].offset == where) ++ rw_aux_tree_set(b, t, l++, _where); ++ ++ /* l now > where */ ++ ++ for (j = l; ++ j < t->size && ++ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; ++ j++) ++ ; ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset + shift == ++ rw_aux_tree(b, t)[l - 1].offset) ++ j++; ++ ++ memmove(&rw_aux_tree(b, t)[l], ++ &rw_aux_tree(b, t)[j], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[j]); ++ t->size -= j - l; ++ ++ for (j = l; j < t->size; j++) ++ rw_aux_tree(b, t)[j].offset += shift; ++ ++ EBUG_ON(l < t->size && ++ rw_aux_tree(b, t)[l].offset == ++ rw_aux_tree(b, t)[l - 1].offset); ++ ++ if (t->size < bset_rw_tree_capacity(b, t) && ++ (l < t->size ++ ? rw_aux_tree(b, t)[l].offset ++ : t->end_offset) - ++ rw_aux_tree(b, t)[l - 1].offset > ++ L1_CACHE_BYTES / sizeof(u64)) { ++ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); ++ struct bkey_packed *end = l < t->size ++ ? 
rw_aux_to_bkey(b, t, l) ++ : btree_bkey_last(b, t); ++ struct bkey_packed *k = start; ++ ++ while (1) { ++ k = bkey_next(k); ++ if (k == end) ++ break; ++ ++ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { ++ memmove(&rw_aux_tree(b, t)[l + 1], ++ &rw_aux_tree(b, t)[l], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[l]); ++ t->size++; ++ rw_aux_tree_set(b, t, l, k); ++ break; ++ } ++ } ++ } ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_insert(struct btree *b, ++ struct btree_node_iter *iter, ++ struct bkey_packed *where, ++ struct bkey_i *insert, ++ unsigned clobber_u64s) ++{ ++ struct bkey_format *f = &b->format; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bkey_packed packed, *src = bkey_to_packed(insert); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); ++ ++ if (bch2_bkey_pack_key(&packed, &insert->k, f)) ++ src = &packed; ++ ++ if (!bkey_deleted(&insert->k)) ++ btree_keys_account_key_add(&b->nr, t - b->set, src); ++ ++ if (src->u64s != clobber_u64s) { ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data + src->u64s; ++ ++ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < ++ (int) clobber_u64s - src->u64s); ++ ++ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); ++ set_btree_bset_end(b, t); ++ } ++ ++ memcpy_u64s(where, src, ++ bkeyp_key_u64s(f, src)); ++ memcpy_u64s(bkeyp_val(f, where), &insert->v, ++ bkeyp_val_u64s(f, src)); ++ ++ if (src->u64s != clobber_u64s) ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_bset_delete(struct btree *b, ++ struct bkey_packed *where, ++ unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data; ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ ++ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); ++ ++ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); ++ set_btree_bset_end(b, t); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); ++} ++ ++/* Lookup */ ++ ++__flatten ++static struct bkey_packed *bset_search_write_set(const struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search) ++{ ++ unsigned l = 0, r = t->size; ++ ++ while (l + 1 != r) { ++ unsigned m = (l + r) >> 1; ++ ++ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ l = m; ++ else ++ r = m; ++ } ++ ++ return rw_aux_to_bkey(b, t, l); ++} ++ ++static inline void prefetch_four_cachelines(void *p) ++{ ++#ifdef CONFIG_X86_64 ++ asm("prefetcht0 (-127 + 64 * 0)(%0);" ++ "prefetcht0 (-127 + 64 * 1)(%0);" ++ "prefetcht0 (-127 + 64 * 2)(%0);" ++ "prefetcht0 (-127 + 64 * 3)(%0);" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++} ++ ++static inline bool bkey_mantissa_bits_dropped(const struct btree *b, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; ++ ++ return f->exponent > key_bits_start; ++#else ++ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; ++ ++ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; ++#endif ++} ++ 
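The bset_search_tree() that follows walks the bkey_float array as an implicit binary tree: slot n's children live in slots 2*n and 2*n + 1, which is also why its loop can usefully prefetch &base->f[n << 4], the start of the sixteen descendants four levels further down. Below is a minimal user-space sketch of that eytzinger-style descent, assuming plain unsigned keys in place of packed bkeys; the helper name eytzinger_lower_bound and the test values are this sketch's own, not part of the patch.

/*
 * Illustrative sketch only, not part of the patch: lower-bound search over an
 * array stored in eytzinger (breadth-first) order, the layout that
 * bset_search_tree() relies on.  Slot 0 is unused, the root is slot 1, and
 * slot i's children are slots 2*i and 2*i + 1.
 */
#include <assert.h>

/* returns the slot of the first element >= key, or 0 if every element is < key */
static unsigned eytzinger_lower_bound(const unsigned *tree, unsigned n,
				      unsigned key)
{
	unsigned i = 1;

	while (i <= n)
		i = 2 * i + (tree[i] < key);	/* go right iff tree[i] < key */

	/*
	 * i encodes the path taken: strip the trailing "went right" bits and
	 * the one "went left" bit to land back on the answer node.
	 */
	return i >> (__builtin_ctz(~i) + 1);
}

int main(void)
{
	/* the sorted set {1, 2, 3, 5, 8} laid out in eytzinger order */
	const unsigned tree[] = { 0, 5, 2, 8, 1, 3 };

	assert(eytzinger_lower_bound(tree, 5, 4) == 1);	/* first key >= 4 is 5, in slot 1 */
	assert(eytzinger_lower_bound(tree, 5, 3) == 5);	/* 3 itself, in slot 5 */
	assert(eytzinger_lower_bound(tree, 5, 9) == 0);	/* nothing >= 9 */
	return 0;
}

For key 4 the walk visits slots 1, 2 and 5 and returns slot 1, whose value 5 is the first element not less than 4. The real search descends on 16-bit mantissas and falls back to a full key comparison whenever a node was flagged BFLOAT_FAILED; it also recovers the answer by converting the final position back to an inorder cacheline index rather than with the bit trick used here.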
++__flatten ++static struct bkey_packed *bset_search_tree(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ struct ro_aux_tree *base = ro_aux_tree_base(b, t); ++ struct bkey_float *f; ++ struct bkey_packed *k; ++ unsigned inorder, n = 1, l, r; ++ int cmp; ++ ++ do { ++ if (likely(n << 4 < t->size)) ++ prefetch(&base->f[n << 4]); ++ ++ f = &base->f[n]; ++ if (unlikely(f->exponent >= BFLOAT_FAILED)) ++ goto slowpath; ++ ++ l = f->mantissa; ++ r = bkey_mantissa(packed_search, f, n); ++ ++ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) ++ goto slowpath; ++ ++ n = n * 2 + (l < r); ++ continue; ++slowpath: ++ k = tree_to_bkey(b, t, n); ++ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); ++ if (!cmp) ++ return k; ++ ++ n = n * 2 + (cmp < 0); ++ } while (n < t->size); ++ ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); ++ ++ /* ++ * n would have been the node we recursed to - the low bit tells us if ++ * we recursed left or recursed right. ++ */ ++ if (likely(!(n & 1))) { ++ --inorder; ++ if (unlikely(!inorder)) ++ return btree_bkey_first(b, t); ++ ++ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; ++ } ++ ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++} ++ ++static __always_inline __flatten ++struct bkey_packed *__bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ ++ /* ++ * First, we search for a cacheline, then lastly we do a linear search ++ * within that cacheline. ++ * ++ * To search for the cacheline, there's three different possibilities: ++ * * The set is too small to have a search tree, so we just do a linear ++ * search over the whole set. ++ * * The set is the one we're currently inserting into; keeping a full ++ * auxiliary search tree up to date would be too expensive, so we ++ * use a much simpler lookup table to do a binary search - ++ * bset_search_write_set(). 
++ * * Or we use the auxiliary search tree we constructed earlier - ++ * bset_search_tree() ++ */ ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return btree_bkey_first(b, t); ++ case BSET_RW_AUX_TREE: ++ return bset_search_write_set(b, t, search); ++ case BSET_RO_AUX_TREE: ++ return bset_search_tree(b, t, search, lossy_packed_search); ++ default: ++ unreachable(); ++ } ++} ++ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search_linear(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search, ++ struct bkey_packed *m) ++{ ++ if (lossy_packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_cmp_p_or_unp(b, m, ++ lossy_packed_search, search) < 0) ++ m = bkey_next(m); ++ ++ if (!packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_pos_cmp(b, m, search) < 0) ++ m = bkey_next(m); ++ ++ if (bch2_expensive_debug_checks) { ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); ++ ++ BUG_ON(prev && ++ bkey_iter_cmp_p_or_unp(b, prev, ++ packed_search, search) >= 0); ++ } ++ ++ return m; ++} ++ ++/* Btree node iterator */ ++ ++static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set *pos; ++ ++ btree_node_iter_for_each(iter, pos) ++ ; ++ ++ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); ++ *pos = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++} ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ __bch2_btree_node_iter_push(iter, b, k, end); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++noinline __flatten __attribute__((cold)) ++static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed *k; ++ ++ trace_bkey_pack_pos_fail(search); ++ ++ bch2_btree_node_iter_init_from_start(iter, b); ++ ++ while ((k = bch2_btree_node_iter_peek(iter, b)) && ++ bkey_iter_pos_cmp(b, k, search) < 0) ++ bch2_btree_node_iter_advance(iter, b); ++} ++ ++/** ++ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a ++ * given position ++ * ++ * Main entry point to the lookup code for individual btree nodes: ++ * ++ * NOTE: ++ * ++ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate ++ * keys. This doesn't matter for most code, but it does matter for lookups. ++ * ++ * Some adjacent keys with a string of equal keys: ++ * i j k k k k l m ++ * ++ * If you search for k, the lookup code isn't guaranteed to return you any ++ * specific k. The lookup code is conceptually doing a binary search and ++ * iterating backwards is very expensive so if the pivot happens to land at the ++ * last k that's what you'll get. ++ * ++ * This works out ok, but it's something to be aware of: ++ * ++ * - For non extents, we guarantee that the live key comes last - see ++ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't ++ * see will only be deleted keys you don't care about. ++ * ++ * - For extents, deleted keys sort last (see the comment at the top of this ++ * file). 
But when you're searching for extents, you actually want the first ++ * key strictly greater than your search key - an extent that compares equal ++ * to the search key is going to have 0 sectors after the search key. ++ * ++ * But this does mean that we can't just search for ++ * bpos_successor(start_of_range) to get the first extent that overlaps with ++ * the range we want - if we're unlucky and there's an extent that ends ++ * exactly where we searched, then there could be a deleted key at the same ++ * position and we'd get that when we search instead of the preceding extent ++ * we needed. ++ * ++ * So we've got to search for start_of_range, then after the lookup iterate ++ * past any extents that compare equal to the position we searched for. ++ */ ++__flatten ++void bch2_btree_node_iter_init(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed p, *packed_search = NULL; ++ struct btree_node_iter_set *pos = iter->data; ++ struct bkey_packed *k[MAX_BSETS]; ++ unsigned i; ++ ++ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); ++ bset_aux_tree_verify(b); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { ++ case BKEY_PACK_POS_EXACT: ++ packed_search = &p; ++ break; ++ case BKEY_PACK_POS_SMALLER: ++ packed_search = NULL; ++ break; ++ case BKEY_PACK_POS_FAIL: ++ btree_node_iter_init_pack_failed(iter, b, search); ++ return; ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ k[i] = __bch2_bset_search(b, b->set + i, search, &p); ++ prefetch_four_cachelines(k[i]); ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ struct bset_tree *t = b->set + i; ++ struct bkey_packed *end = btree_bkey_last(b, t); ++ ++ k[i] = bch2_bset_search_linear(b, t, search, ++ packed_search, &p, k[i]); ++ if (k[i] != end) ++ *pos++ = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k[i]), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) ++ return __btree_node_offset_to_key(b, set->k); ++ ++ return btree_bkey_last(b, t); ++} ++ ++static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned first) ++{ ++ bool ret; ++ ++ if ((ret = (btree_node_iter_cmp(b, ++ iter->data[first], ++ iter->data[first + 1]) > 0))) ++ swap(iter->data[first], iter->data[first + 1]); ++ return ret; ++} ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ /* unrolled bubble sort: */ ++ ++ if (!__btree_node_iter_set_end(iter, 2)) { ++ btree_node_iter_sort_two(iter, b, 0); ++ btree_node_iter_sort_two(iter, b, 1); ++ } ++ ++ if (!__btree_node_iter_set_end(iter, 1)) ++ btree_node_iter_sort_two(iter, b, 0); ++} ++ ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, ++ struct btree_node_iter_set *set) ++{ ++ struct btree_node_iter_set *last = ++ iter->data + ARRAY_SIZE(iter->data) - 1; ++ ++ 
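++	/* slide the sets after @set down one slot, then clear the now-unused last slot */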
memmove(&set[0], &set[1], (void *) last - (void *) set); ++ *last = (struct btree_node_iter_set) { 0, 0 }; ++} ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; ++ ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ if (unlikely(__btree_node_iter_set_end(iter, 0))) { ++ bch2_btree_node_iter_set_drop(iter, iter->data); ++ return; ++ } ++ ++ if (__btree_node_iter_set_end(iter, 1)) ++ return; ++ ++ if (!btree_node_iter_sort_two(iter, b, 0)) ++ return; ++ ++ if (__btree_node_iter_set_end(iter, 2)) ++ return; ++ ++ btree_node_iter_sort_two(iter, b, 1); ++} ++ ++void bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ if (bch2_expensive_debug_checks) { ++ bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_next_check(iter, b); ++ } ++ ++ __bch2_btree_node_iter_advance(iter, b); ++} ++ ++/* ++ * Expensive: ++ */ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct btree_node_iter_set *set; ++ struct bset_tree *t; ++ unsigned end = 0; ++ ++ if (bch2_expensive_debug_checks) ++ bch2_btree_node_iter_verify(iter, b); ++ ++ for_each_bset(b, t) { ++ k = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ if (k && ++ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { ++ prev = k; ++ end = t->end_offset; ++ } ++ } ++ ++ if (!prev) ++ return NULL; ++ ++ /* ++ * We're manually memmoving instead of just calling sort() to ensure the ++ * prev we picked ends up in slot 0 - sort won't necessarily put it ++ * there because of duplicate deleted keys: ++ */ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == end) ++ goto found; ++ ++ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); ++found: ++ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); ++ ++ memmove(&iter->data[1], ++ &iter->data[0], ++ (void *) set - (void *) &iter->data[0]); ++ ++ iter->data[0].k = __btree_node_key_to_offset(b, prev); ++ iter->data[0].end = end; ++ ++ if (bch2_expensive_debug_checks) ++ bch2_btree_node_iter_verify(iter, b); ++ return prev; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *prev; ++ ++ do { ++ prev = bch2_btree_node_iter_prev_all(iter, b); ++ } while (prev && bkey_deleted(prev)); ++ ++ return prev; ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bkey *u) ++{ ++ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); ++ ++ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; ++} ++ ++/* Mergesort */ ++ ++void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ enum bset_aux_tree_type type = bset_aux_tree_type(t); ++ size_t j; ++ ++ stats->sets[type].nr++; ++ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * ++ sizeof(u64); ++ ++ if (bset_has_ro_aux_tree(t)) { ++ stats->floats += t->size - 1; ++ ++ for (j = 1; j < t->size; j++) ++ stats->failed += ++ bkey_float(b, t, j)->exponent == ++ BFLOAT_FAILED; ++ } ++ } ++} ++ ++void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ++ struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk; ++ unsigned j, inorder; ++ ++ if (!bset_has_ro_aux_tree(t)) ++ return; ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ if (!inorder || inorder >= t->size) ++ return; ++ ++ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); ++ if (k != tree_to_bkey(b, t, j)) ++ return; ++ ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED: ++ uk = bkey_unpack_key(b, k); ++ prt_printf(out, ++ " failed unpacked at depth %u\n" ++ "\t", ++ ilog2(j)); ++ bch2_bpos_to_text(out, uk.p); ++ prt_printf(out, "\n"); ++ break; ++ } ++} +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +new file mode 100644 +index 000000000000..0d46534c3dcd +--- /dev/null ++++ b/fs/bcachefs/bset.h +@@ -0,0 +1,615 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BSET_H ++#define _BCACHEFS_BSET_H ++ ++#include ++#include ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "util.h" /* for time_stats */ ++#include "vstructs.h" ++ ++/* ++ * BKEYS: ++ * ++ * A bkey contains a key, a size field, a variable number of pointers, and some ++ * ancillary flag bits. ++ * ++ * We use two different functions for validating bkeys, bkey_invalid and ++ * bkey_deleted(). ++ * ++ * The one exception to the rule that ptr_invalid() filters out invalid keys is ++ * that it also filters out keys of size 0 - these are keys that have been ++ * completely overwritten. It'd be safe to delete these in memory while leaving ++ * them on disk, just unnecessary work - so we filter them out when resorting ++ * instead. ++ * ++ * We can't filter out stale keys when we're resorting, because garbage ++ * collection needs to find them to ensure bucket gens don't wrap around - ++ * unless we're rewriting the btree node those stale keys still exist on disk. ++ * ++ * We also implement functions here for removing some number of sectors from the ++ * front or the back of a bkey - this is mainly used for fixing overlapping ++ * extents, by removing the overlapping sectors from the older key. ++ * ++ * BSETS: ++ * ++ * A bset is an array of bkeys laid out contiguously in memory in sorted order, ++ * along with a header. A btree node is made up of a number of these, written at ++ * different times. ++ * ++ * There could be many of them on disk, but we never allow there to be more than ++ * 4 in memory - we lazily resort as needed. ++ * ++ * We implement code here for creating and maintaining auxiliary search trees ++ * (described below) for searching an individial bset, and on top of that we ++ * implement a btree iterator. ++ * ++ * BTREE ITERATOR: ++ * ++ * Most of the code in bcache doesn't care about an individual bset - it needs ++ * to search entire btree nodes and iterate over them in sorted order. 
++ * ++ * The btree iterator code serves both functions; it iterates through the keys ++ * in a btree node in sorted order, starting from either keys after a specific ++ * point (if you pass it a search key) or the start of the btree node. ++ * ++ * AUXILIARY SEARCH TREES: ++ * ++ * Since keys are variable length, we can't use a binary search on a bset - we ++ * wouldn't be able to find the start of the next key. But binary searches are ++ * slow anyways, due to terrible cache behaviour; bcache originally used binary ++ * searches and that code topped out at under 50k lookups/second. ++ * ++ * So we need to construct some sort of lookup table. Since we only insert keys ++ * into the last (unwritten) set, most of the keys within a given btree node are ++ * usually in sets that are mostly constant. We use two different types of ++ * lookup tables to take advantage of this. ++ * ++ * Both lookup tables share in common that they don't index every key in the ++ * set; they index one key every BSET_CACHELINE bytes, and then a linear search ++ * is used for the rest. ++ * ++ * For sets that have been written to disk and are no longer being inserted ++ * into, we construct a binary search tree in an array - traversing a binary ++ * search tree in an array gives excellent locality of reference and is very ++ * fast, since both children of any node are adjacent to each other in memory ++ * (and their grandchildren, and great grandchildren...) - this means ++ * prefetching can be used to great effect. ++ * ++ * It's quite useful performance wise to keep these nodes small - not just ++ * because they're more likely to be in L2, but also because we can prefetch ++ * more nodes on a single cacheline and thus prefetch more iterations in advance ++ * when traversing this tree. ++ * ++ * Nodes in the auxiliary search tree must contain both a key to compare against ++ * (we don't want to fetch the key from the set, that would defeat the purpose), ++ * and a pointer to the key. We use a few tricks to compress both of these. ++ * ++ * To compress the pointer, we take advantage of the fact that one node in the ++ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have ++ * a function (to_inorder()) that takes the index of a node in a binary tree and ++ * returns what its index would be in an inorder traversal, so we only have to ++ * store the low bits of the offset. ++ * ++ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To ++ * compress that, we take advantage of the fact that when we're traversing the ++ * search tree at every iteration we know that both our search key and the key ++ * we're looking for lie within some range - bounded by our previous ++ * comparisons. (We special case the start of a search so that this is true even ++ * at the root of the tree). ++ * ++ * So we know the key we're looking for is between a and b, and a and b don't ++ * differ higher than bit 50, we don't need to check anything higher than bit ++ * 50. ++ * ++ * We don't usually need the rest of the bits, either; we only need enough bits ++ * to partition the key range we're currently checking. Consider key n - the ++ * key our auxiliary search tree node corresponds to, and key p, the key ++ * immediately preceding n. The lowest bit we need to store in the auxiliary ++ * search tree is the highest bit that differs between n and p. ++ * ++ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the ++ * comparison. 
But we'd really like our nodes in the auxiliary search tree to be ++ * of fixed size. ++ * ++ * The solution is to make them fixed size, and when we're constructing a node ++ * check if p and n differed in the bits we needed them to. If they don't we ++ * flag that node, and when doing lookups we fall back to comparing against the ++ * real key. As long as this doesn't happen too often (and it seems to reliably ++ * happen a bit less than 1% of the time), we win - even on failures, that key ++ * is then more likely to be in cache than if we were doing binary searches all ++ * the way, since we're touching so much less memory. ++ * ++ * The keys in the auxiliary search tree are stored in (software) floating ++ * point, with an exponent and a mantissa. The exponent needs to be big enough ++ * to address all the bits in the original key, but the number of bits in the ++ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. ++ * ++ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys ++ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. ++ * We need one node per 128 bytes in the btree node, which means the auxiliary ++ * search trees take up 3% as much memory as the btree itself. ++ * ++ * Constructing these auxiliary search trees is moderately expensive, and we ++ * don't want to be constantly rebuilding the search tree for the last set ++ * whenever we insert another key into it. For the unwritten set, we use a much ++ * simpler lookup table - it's just a flat array, so index i in the lookup table ++ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing ++ * within each byte range works the same as with the auxiliary search trees. ++ * ++ * These are much easier to keep up to date when we insert a key - we do it ++ * somewhat lazily; when we shift a key up we usually just increment the pointer ++ * to it, only when it would overflow do we go to the trouble of finding the ++ * first key in that range of bytes again. ++ */ ++ ++enum bset_aux_tree_type { ++ BSET_NO_AUX_TREE, ++ BSET_RO_AUX_TREE, ++ BSET_RW_AUX_TREE, ++}; ++ ++#define BSET_TREE_NR_TYPES 3 ++ ++#define BSET_NO_AUX_TREE_VAL (U16_MAX) ++#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) ++ ++static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) ++{ ++ switch (t->extra) { ++ case BSET_NO_AUX_TREE_VAL: ++ EBUG_ON(t->size); ++ return BSET_NO_AUX_TREE; ++ case BSET_RW_AUX_TREE_VAL: ++ EBUG_ON(!t->size); ++ return BSET_RW_AUX_TREE; ++ default: ++ EBUG_ON(!t->size); ++ return BSET_RO_AUX_TREE; ++ } ++} ++ ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It defines the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliary search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. ++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline.
++ */ ++ ++#define BSET_CACHELINE 256 ++ ++static inline size_t btree_keys_cachelines(const struct btree *b) ++{ ++ return (1U << b->byte_order) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(const struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(const struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (bch2_expensive_debug_checks) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } ++#else ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++#endif ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? 
bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ ++#define for_each_bset(_b, _t) \ ++ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) ++ ++#define bset_tree_for_each_key(_b, _t, _k) \ ++ for (_k = btree_bkey_first(_b, _t); \ ++ _k != btree_bkey_last(_b, _t); \ ++ _k = bkey_next(_k)) ++ ++static inline bool bset_has_ro_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; ++} ++ ++static inline bool bset_has_rw_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; ++} ++ ++static inline void bch2_bset_set_no_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ BUG_ON(t < b->set); ++ ++ for (; t < b->set + ARRAY_SIZE(b->set); t++) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ t->aux_data_offset = U16_MAX; ++ } ++} ++ ++static inline void btree_node_set_format(struct btree *b, ++ struct bkey_format f) ++{ ++ int len; ++ ++ b->format = f; ++ b->nr_key_bits = bkey_format_key_bits(&f); ++ ++ len = bch2_compile_bkey_format(&b->format, b->aux_data); ++ BUG_ON(len < 0 || len > U8_MAX); ++ ++ b->unpack_fn_len = len; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++static inline struct bset *bset_next_set(struct btree *b, ++ unsigned block_bytes) ++{ ++ struct bset *i = btree_bset_last(b); ++ ++ EBUG_ON(!is_power_of_2(block_bytes)); ++ ++ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); ++} ++ ++void bch2_btree_keys_init(struct btree *); ++ ++void bch2_bset_init_first(struct btree *, struct bset *); ++void bch2_bset_init_next(struct bch_fs *, struct btree *, ++ struct btree_node_entry *); ++void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); ++ ++void bch2_bset_insert(struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, struct bkey_i *, unsigned); ++void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); ++ ++/* Bkey utility code */ ++ ++/* packed or unpacked */ ++static inline int bkey_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ EBUG_ON(r_packed && !bkey_packed(r_packed)); ++ ++ if (unlikely(!bkey_packed(l))) ++ return bpos_cmp(packed_to_bkey_c(l)->p, *r); ++ ++ if (likely(r_packed)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); ++ ++ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, ++ struct bkey_packed *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 1); ++} ++ ++/* Btree key iteration */ ++ ++void bch2_btree_node_iter_push(struct 
btree_node_iter *, struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, ++ struct bpos *); ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, ++ struct btree *, ++ struct bset_tree *); ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *, ++ struct btree_node_iter_set *); ++void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); ++ ++#define btree_node_iter_for_each(_iter, _set) \ ++ for (_set = (_iter)->data; \ ++ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ ++ (_set)->k != (_set)->end; \ ++ _set++) ++ ++static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, ++ unsigned i) ++{ ++ return iter->data[i].k == iter->data[i].end; ++} ++ ++static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) ++{ ++ return __btree_node_iter_set_end(iter, 0); ++} ++ ++/* ++ * When keys compare equal, deleted keys compare first: ++ * ++ * XXX: only need to compare pointers for keys that are both within a ++ * btree_node_iterator - we need to break ties for prev() to work correctly ++ */ ++static inline int bkey_iter_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ++ ?: cmp_int(l, r); ++} ++ ++static inline int btree_node_iter_cmp(const struct btree *b, ++ struct btree_node_iter_set l, ++ struct btree_node_iter_set r) ++{ ++ return bkey_iter_cmp(b, ++ __btree_node_offset_to_key(b, l.k), ++ __btree_node_offset_to_key(b, r.k)); ++} ++ ++/* These assume r (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp_left_packed(b, l, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ return bkey_cmp_p_or_unp(b, l, r_packed, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline struct bkey_packed * ++__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ return !bch2_btree_node_iter_end(iter) ++ ? 
__btree_node_offset_to_key(b, iter->data->k) ++ : NULL; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *k; ++ ++ while ((k = bch2_btree_node_iter_peek_all(iter, b)) && ++ bkey_deleted(k)) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return k; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return ret; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, ++ struct btree *); ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, ++ struct btree *, ++ struct bkey *); ++ ++#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ ++ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ ++ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ ++ bch2_btree_node_iter_advance(iter, b)) ++ ++/* Accounting: */ ++ ++static inline void btree_keys_account_key(struct btree_nr_keys *n, ++ unsigned bset, ++ struct bkey_packed *k, ++ int sign) ++{ ++ n->live_u64s += k->u64s * sign; ++ n->bset_u64s[bset] += k->u64s * sign; ++ ++ if (bkey_packed(k)) ++ n->packed_keys += sign; ++ else ++ n->unpacked_keys += sign; ++} ++ ++static inline void btree_keys_account_val_delta(struct btree *b, ++ struct bkey_packed *k, ++ int delta) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ b->nr.live_u64s += delta; ++ b->nr.bset_u64s[t - b->set] += delta; ++} ++ ++#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, 1) ++#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, -1) ++ ++#define btree_account_key_add(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) ++#define btree_account_key_drop(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) ++ ++struct bset_stats { ++ struct { ++ size_t nr, bytes; ++ } sets[BSET_TREE_NR_TYPES]; ++ ++ size_t floats; ++ size_t failed; ++}; ++ ++void bch2_btree_keys_stats(struct btree *, struct bset_stats *); ++void bch2_bfloat_to_text(struct printbuf *, struct btree *, ++ struct bkey_packed *); ++ ++/* Debug stuff */ ++ ++void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct bch_fs *, struct btree *); ++void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *); ++void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); ++void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, ++ struct bkey_packed *, unsigned); ++ ++#else ++ ++static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} ++static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) {} ++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct bkey_packed *insert, ++ unsigned clobber_u64s) {} ++#endif ++ ++static inline void bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ if (bch2_debug_check_btree_accounting) ++ __bch2_verify_btree_nr_keys(b); ++} ++ ++#endif /* 
_BCACHEFS_BSET_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +new file mode 100644 +index 000000000000..579a8f8c6a65 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.c +@@ -0,0 +1,1170 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++#include "errcode.h" ++#include "error.h" ++ ++#include ++#include ++#include ++ ++struct lock_class_key bch2_btree_node_lock_key; ++ ++const char * const bch2_btree_node_flags[] = { ++#define x(f) #f, ++ BTREE_FLAGS() ++#undef x ++ NULL ++}; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *c) ++{ ++ unsigned i, reserve = 16; ++ ++ if (!c->btree_roots[0].b) ++ reserve += 8; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ reserve += min_t(unsigned, 1, ++ c->btree_roots[i].b->c.level) * 8; ++ ++ c->btree_cache.reserve = reserve; ++} ++ ++static inline unsigned btree_cache_can_free(struct btree_cache *bc) ++{ ++ return max_t(int, 0, bc->used - bc->reserve); ++} ++ ++static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) ++{ ++ if (b->c.lock.readers) ++ list_move(&b->list, &bc->freed_pcpu); ++ else ++ list_move(&b->list, &bc->freed_nonpcpu); ++} ++ ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ EBUG_ON(btree_node_write_in_flight(b)); ++ ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++#ifdef __KERNEL__ ++ vfree(b->aux_data); ++#else ++ munmap(b->aux_data, btree_aux_data_bytes(b)); ++#endif ++ b->aux_data = NULL; ++ ++ bc->used--; ++ ++ btree_node_to_freedlist(bc, b); ++} ++ ++static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct btree *b = obj; ++ const u64 *v = arg->key; ++ ++ return b->hash_val == *v ? 
0 : 1; ++} ++ ++static const struct rhashtable_params bch_btree_cache_params = { ++ .head_offset = offsetof(struct btree, hash), ++ .key_offset = offsetof(struct btree, hash_val), ++ .key_len = sizeof(u64), ++ .obj_cmpfn = bch2_btree_cache_cmp_fn, ++}; ++ ++static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ BUG_ON(b->data || b->aux_data); ++ ++ b->data = kvpmalloc(btree_bytes(c), gfp); ++ if (!b->data) ++ return -ENOMEM; ++#ifdef __KERNEL__ ++ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++#else ++ b->aux_data = mmap(NULL, btree_aux_data_bytes(b), ++ PROT_READ|PROT_WRITE|PROT_EXEC, ++ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); ++ if (b->aux_data == MAP_FAILED) ++ b->aux_data = NULL; ++#endif ++ if (!b->aux_data) { ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static struct btree *__btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ if (!b) ++ return NULL; ++ ++ bkey_btree_ptr_init(&b->key); ++ __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ b->byte_order = ilog2(btree_bytes(c)); ++ return b; ++} ++ ++struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b = __btree_node_mem_alloc(c); ++ if (!b) ++ return NULL; ++ ++ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { ++ kfree(b); ++ return NULL; ++ } ++ ++ bc->used++; ++ list_add(&b->list, &bc->freeable); ++ return b; ++} ++ ++/* Btree in memory cache - hash table */ ++ ++void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) ++{ ++ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ BUG_ON(ret); ++ ++ /* Cause future lookups for this node to fail: */ ++ b->hash_val = 0; ++ ++ six_lock_wakeup_all(&b->c.lock); ++} ++ ++int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) ++{ ++ BUG_ON(b->hash_val); ++ b->hash_val = btree_ptr_hash_val(&b->key); ++ ++ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, ++ bch_btree_cache_params); ++} ++ ++int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, ++ unsigned level, enum btree_id id) ++{ ++ int ret; ++ ++ b->c.level = level; ++ b->c.btree_id = id; ++ ++ mutex_lock(&bc->lock); ++ ret = __bch2_btree_node_hash_insert(bc, b); ++ if (!ret) ++ list_add(&b->list, &bc->live); ++ mutex_unlock(&bc->lock); ++ ++ return ret; ++} ++ ++__flatten ++static inline struct btree *btree_cache_find(struct btree_cache *bc, ++ const struct bkey_i *k) ++{ ++ u64 v = btree_ptr_hash_val(k); ++ ++ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++wait_on_io: ++ if (b->flags & ((1U << BTREE_NODE_dirty)| ++ (1U << BTREE_NODE_read_in_flight)| ++ (1U << BTREE_NODE_write_in_flight))) { ++ if (!flush) ++ return -ENOMEM; ++ ++ /* XXX: waiting on IO with btree cache lock held */ ++ bch2_btree_node_wait_on_read(b); ++ bch2_btree_node_wait_on_write(b); ++ } ++ ++ if (!six_trylock_intent(&b->c.lock)) ++ return -ENOMEM; ++ ++ if (!six_trylock_write(&b->c.lock)) ++ goto out_unlock_intent; ++ ++ /* recheck 
under lock */ ++ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| ++ (1U << BTREE_NODE_write_in_flight))) { ++ if (!flush) ++ goto out_unlock; ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++ ++ if (btree_node_noevict(b) || ++ btree_node_write_blocked(b) || ++ btree_node_will_make_reachable(b)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b)) { ++ if (!flush) ++ goto out_unlock; ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (bch2_verify_btree_ondisk) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); ++ else ++ __bch2_btree_node_write(c, b, 0); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++out: ++ if (b->hash_val && !ret) ++ trace_btree_node_reap(c, b); ++ return ret; ++out_unlock: ++ six_unlock_write(&b->c.lock); ++out_unlock_intent: ++ six_unlock_intent(&b->c.lock); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, false); ++} ++ ++static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, true); ++} ++ ++static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b, *t; ++ unsigned long nr = sc->nr_to_scan; ++ unsigned long can_free = 0; ++ unsigned long touched = 0; ++ unsigned long freed = 0; ++ unsigned i, flags; ++ unsigned long ret = SHRINK_STOP; ++ ++ if (bch2_btree_shrinker_disabled) ++ return SHRINK_STOP; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out_norestore; ++ ++ flags = memalloc_nofs_save(); ++ ++ /* ++ * It's _really_ critical that we don't free too many btree nodes - we ++ * have to always leave ourselves a reserve. 
The reserve is how we ++ * guarantee that allocating memory for a new btree node can always ++ * succeed, so that inserting keys into the btree can always succeed and ++ * IO can always make forward progress: ++ */ ++ can_free = btree_cache_can_free(bc); ++ nr = min_t(unsigned long, nr, can_free); ++ ++ i = 0; ++ list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ /* ++ * Leave a few nodes on the freeable list, so that a btree split ++ * won't have to hit the system allocator: ++ */ ++ if (++i <= 3) ++ continue; ++ ++ touched++; ++ ++ if (touched >= nr) ++ break; ++ ++ if (!btree_node_reclaim(c, b)) { ++ btree_node_data_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ freed++; ++ } ++ } ++restart: ++ list_for_each_entry_safe(b, t, &bc->live, list) { ++ /* tweak this */ ++ if (btree_node_accessed(b)) { ++ clear_btree_node_accessed(b); ++ goto touched; ++ } ++ ++ if (!btree_node_reclaim(c, b)) { ++ /* can't call bch2_btree_node_hash_remove under lock */ ++ freed++; ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ ++ btree_node_data_free(c, b); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ if (freed >= nr) ++ goto out; ++ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out; ++ goto restart; ++ } else { ++ continue; ++ } ++touched: ++ touched++; ++ ++ if (touched >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } ++ } ++ ++ mutex_unlock(&bc->lock); ++out: ++ ret = freed; ++ memalloc_nofs_restore(flags); ++out_norestore: ++ trace_btree_cache_scan(sc->nr_to_scan, can_free, ret); ++ return ret; ++} ++ ++static unsigned long bch2_btree_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bch2_btree_shrinker_disabled) ++ return 0; ++ ++ return btree_cache_can_free(bc); ++} ++ ++static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ ++ bch2_btree_cache_to_text(out, c); ++} ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ unsigned i, flags; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ /* vfree() can allocate memory: */ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ if (c->verify_data) ++ list_move(&c->verify_data->list, &bc->live); ++ ++ kvpfree(c->verify_ondisk, btree_bytes(c)); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ list_add(&c->btree_roots[i].b->list, &bc->live); ++ ++ list_splice(&bc->freeable, &bc->live); ++ ++ while (!list_empty(&bc->live)) { ++ b = list_first_entry(&bc->live, struct btree, list); ++ ++ BUG_ON(btree_node_read_in_flight(b) || ++ btree_node_write_in_flight(b)); ++ ++ if (btree_node_dirty(b)) ++ bch2_btree_complete_write(c, b, btree_current_write(b)); ++ clear_btree_node_dirty_acct(c, b); ++ ++ btree_node_data_free(c, b); ++ } ++ ++ BUG_ON(atomic_read(&c->btree_cache.dirty)); ++ ++ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); ++ ++ while (!list_empty(&bc->freed_nonpcpu)) { ++ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); ++ list_del(&b->list); ++ 
six_lock_pcpu_free(&b->c.lock); ++ kfree(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++} ++ ++int bch2_fs_btree_cache_init(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); ++ if (ret) ++ goto out; ++ ++ bc->table_init_done = true; ++ ++ bch2_recalc_btree_reserve(c); ++ ++ for (i = 0; i < bc->reserve; i++) ++ if (!__bch2_btree_node_mem_alloc(c)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_splice_init(&bc->live, &bc->freeable); ++ ++ mutex_init(&c->verify_lock); ++ ++ bc->shrink.count_objects = bch2_btree_cache_count; ++ bc->shrink.scan_objects = bch2_btree_cache_scan; ++ bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; ++ bc->shrink.seeks = 4; ++ ret = register_shrinker(&bc->shrink); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++void bch2_fs_btree_cache_init_early(struct btree_cache *bc) ++{ ++ mutex_init(&bc->lock); ++ INIT_LIST_HEAD(&bc->live); ++ INIT_LIST_HEAD(&bc->freeable); ++ INIT_LIST_HEAD(&bc->freed_pcpu); ++ INIT_LIST_HEAD(&bc->freed_nonpcpu); ++} ++ ++/* ++ * We can only have one thread cannibalizing other cached btree nodes at a time, ++ * or we'll deadlock. We use an open coded mutex to ensure that, which a ++ * cannibalize_bucket() will take. This means every time we unlock the root of ++ * the btree, we need to release this lock if we have it held. ++ */ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bc->alloc_lock == current) { ++ trace_btree_node_cannibalize_unlock(c); ++ bc->alloc_lock = NULL; ++ closure_wake_up(&bc->alloc_wait); ++ } ++} ++ ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct task_struct *old; ++ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) ++ goto success; ++ ++ if (!cl) { ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -ENOMEM; ++ } ++ ++ closure_wait(&bc->alloc_wait, cl); ++ ++ /* Try again, after adding ourselves to waitlist */ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) { ++ /* We raced */ ++ closure_wake_up(&bc->alloc_wait); ++ goto success; ++ } ++ ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -EAGAIN; ++ ++success: ++ trace_btree_node_cannibalize_lock(c); ++ return 0; ++} ++ ++static struct btree *btree_node_cannibalize(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_reclaim(c, b)) ++ return b; ++ ++ while (1) { ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_write_and_reclaim(c, b)) ++ return b; ++ ++ /* ++ * Rare case: all nodes were intent-locked. ++ * Just busy-wait. ++ */ ++ WARN_ONCE(1, "btree cache cannibalize failed\n"); ++ cond_resched(); ++ } ++} ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct list_head *freed = pcpu_read_locks ++ ? 
&bc->freed_pcpu ++ : &bc->freed_nonpcpu; ++ struct btree *b, *b2; ++ u64 start_time = local_clock(); ++ unsigned flags; ++ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ /* ++ * We never free struct btree itself, just the memory that holds the on ++ * disk node. Check the freed list before allocating a new one: ++ */ ++ list_for_each_entry(b, freed, list) ++ if (!btree_node_reclaim(c, b)) { ++ list_del_init(&b->list); ++ goto got_node; ++ } ++ ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err_locked; ++ ++ if (pcpu_read_locks) ++ six_lock_pcpu_alloc(&b->c.lock); ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); ++got_node: ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b2, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b2)) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ goto got_mem; ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; ++ ++ mutex_lock(&bc->lock); ++ bc->used++; ++got_mem: ++ mutex_unlock(&bc->lock); ++ ++ BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_write_in_flight(b)); ++out: ++ b->flags = 0; ++ b->written = 0; ++ b->nsets = 0; ++ b->sib_u64s[0] = 0; ++ b->sib_u64s[1] = 0; ++ b->whiteout_u64s = 0; ++ bch2_btree_keys_init(b); ++ set_btree_node_accessed(b); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], ++ start_time); ++ ++ memalloc_nofs_restore(flags); ++ return b; ++err: ++ mutex_lock(&bc->lock); ++err_locked: ++ /* Try to cannibalize another cached btree node: */ ++ if (bc->alloc_lock == current) { ++ b2 = btree_node_cannibalize(c); ++ bch2_btree_node_hash_remove(bc, b2); ++ ++ if (b) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ } else { ++ b = b2; ++ list_del_init(&b->list); ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ trace_btree_node_cannibalize(c); ++ goto out; ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* Slowpath, don't want it inlined into btree_iter_traverse() */ ++static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ enum six_lock_type lock_type, ++ bool sync) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ u32 seq; ++ ++ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); ++ /* ++ * Parent node must be locked, else we could read in a btree node that's ++ * been freed: ++ */ ++ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { ++ trace_trans_restart_relock_parent_for_fill(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); ++ } ++ ++ b = bch2_btree_node_mem_alloc(c, level != 0); ++ ++ if (trans && b == ERR_PTR(-ENOMEM)) { ++ trans->memory_allocation_failure = true; ++ trace_trans_restart_memory_allocation_failure(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); ++ ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); ++ } ++ ++ if (IS_ERR(b)) ++ return b; 
++ ++ bkey_copy(&b->key, k); ++ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { ++ /* raced with another fill: */ ++ ++ /* mark as unhashed... */ ++ b->hash_val = 0; ++ ++ mutex_lock(&bc->lock); ++ list_add(&b->list, &bc->freeable); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ set_btree_node_read_in_flight(b); ++ ++ six_unlock_write(&b->c.lock); ++ seq = b->c.lock.state.seq; ++ six_unlock_intent(&b->c.lock); ++ ++ /* Unlock before doing IO: */ ++ if (trans && sync) ++ bch2_trans_unlock(trans); ++ ++ bch2_btree_node_read(c, b, sync); ++ ++ if (!sync) ++ return NULL; ++ ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } ++ } ++ ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) { ++ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, ++ btree_id, &path->pos); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); ++ } ++ ++ return b; ++} ++ ++static int lock_node_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ const struct bkey_i *k = p; ++ ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; ++} ++ ++static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ prt_printf(&buf, ++ "btree node header doesn't match ptr\n" ++ "btree %s level %u\n" ++ "ptr: ", ++ bch2_btree_ids[b->c.btree_id], b->c.level); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ ++ prt_printf(&buf, "\nheader: btree %s level %llu\n" ++ "min ", ++ bch2_btree_ids[BTREE_NODE_ID(b->data)], ++ BTREE_NODE_LEVEL(b->data)); ++ bch2_bpos_to_text(&buf, b->data->min_key); ++ ++ prt_printf(&buf, "\nmax "); ++ bch2_bpos_to_text(&buf, b->data->max_key); ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++} ++ ++static inline void btree_check_header(struct bch_fs *c, struct btree *b) ++{ ++ if (b->c.btree_id != BTREE_NODE_ID(b->data) || ++ b->c.level != BTREE_NODE_LEVEL(b->data) || ++ bpos_cmp(b->data->max_key, b->key.k.p) || ++ (b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ bpos_cmp(b->data->min_key, ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) ++ btree_bad_header(c, b); ++} ++ ++/** ++ * bch_btree_node_get - find a btree node in the cache and lock it, reading it ++ * in from disk if necessary. ++ * ++ * If IO is necessary and running under generic_make_request, returns -EAGAIN. ++ * ++ * The btree node will have either a read or a write lock held, depending on ++ * the @write parameter. 
++ */ ++struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, ++ const struct bkey_i *k, unsigned level, ++ enum six_lock_type lock_type, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ ++ /* ++ * Check b->hash_val _before_ calling btree_node_lock() - this might not ++ * be the node we want anymore, and trying to lock the wrong node could ++ * cause an unneccessary transaction restart: ++ */ ++ if (likely(c->opts.btree_node_mem_ptr_optimization && ++ b && ++ b->hash_val == btree_ptr_hash_val(k))) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ /* ++ * We must have the parent locked to call bch2_btree_node_fill(), ++ * else we could read in a btree node from disk that's been ++ * freed: ++ */ ++ b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, ++ level, lock_type, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ /* ++ * There's a potential deadlock with splits and insertions into ++ * interior nodes we have to avoid: ++ * ++ * The other thread might be holding an intent lock on the node ++ * we want, and they want to update its parent node so they're ++ * going to upgrade their intent lock on the parent node to a ++ * write lock. ++ * ++ * But if we're holding a read lock on the parent, and we're ++ * trying to get the intent lock they're holding, we deadlock. ++ * ++ * So to avoid this we drop the read locks on parent nodes when ++ * we're starting to take intent locks - and handle the race. ++ * ++ * The race is that they might be about to free the node we ++ * want, and dropping our read lock on the parent node lets them ++ * update the parent marking the node we want as freed, and then ++ * free it: ++ * ++ * To guard against this, btree nodes are evicted from the cache ++ * when they're freed - and b->hash_val is zeroed out, which we ++ * check for after we lock the node. 
++ * ++ * Then, bch2_btree_node_relock() on the parent will fail - because ++ * the parent was modified, when the pointer to the node we want ++ * was removed - and we'll bail out: ++ */ ++ if (btree_node_read_locked(path, level + 1)) ++ btree_node_unlock(trans, path, level + 1); ++ ++ ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, ++ lock_node_check_fn, (void *) k, trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) ++ goto retry; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ERR_PTR(ret); ++ BUG(); ++ } ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.level != level || ++ race_fault())) { ++ six_unlock_type(&b->c.lock, lock_type); ++ if (bch2_btree_node_relock(trans, path, level + 1)) ++ goto retry; ++ ++ trace_trans_restart_btree_node_reused(trans->fn, ++ trace_ip, ++ path->btree_id, ++ &path->pos); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); ++ } ++ } ++ ++ if (unlikely(btree_node_read_in_flight(b))) { ++ u32 seq = b->c.lock.state.seq; ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ bch2_trans_unlock(trans); ++ ++ bch2_btree_node_wait_on_read(b); ++ ++ /* ++ * should_be_locked is not set on this path yet, so we need to ++ * relock it specifically: ++ */ ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } ++ } ++ ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) ++ goto retry; ++ } ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_type(&b->c.lock, lock_type); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != path->btree_id); ++ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); ++ btree_check_header(c, b); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ bool nofill) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (c->opts.btree_node_mem_ptr_optimization) { ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++ } ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ if (nofill) ++ goto out; ++ ++ b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, ++ level, SIX_LOCK_read, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b) && ++ !bch2_btree_cache_cannibalize_lock(c, NULL)) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ goto out; ++ } else { ++lock_node: ++ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); ++ if (ret) ++ goto retry; ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.btree_id != btree_id || ++ b->c.level != level)) { ++ six_unlock_read(&b->c.lock); ++ goto retry; ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ __bch2_btree_node_wait_on_read(b); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ 
prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_read(&b->c.lock); ++ b = ERR_PTR(-EIO); ++ goto out; ++ } ++ ++ EBUG_ON(b->c.btree_id != btree_id); ++ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); ++ btree_check_header(c, b); ++out: ++ bch2_btree_cache_cannibalize_unlock(c); ++ return b; ++} ++ ++int bch2_btree_node_prefetch(struct bch_fs *c, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ const struct bkey_i *k, ++ enum btree_id btree_id, unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(trans && !btree_node_locked(path, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_cache_find(bc, k); ++ if (b) ++ return 0; ++ ++ b = bch2_btree_node_fill(c, trans, path, k, btree_id, ++ level, SIX_LOCK_read, false); ++ return PTR_ERR_OR_ZERO(b); ++} ++ ++void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ b = btree_cache_find(bc, k); ++ if (!b) ++ return; ++wait_on_io: ++ /* not allowed to wait on io with btree locks held: */ ++ ++ /* XXX we're called from btree_gc which will be holding other btree ++ * nodes locked ++ * */ ++ __bch2_btree_node_wait_on_read(b); ++ __bch2_btree_node_wait_on_write(b); ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ if (btree_node_dirty(b)) { ++ __bch2_btree_node_write(c, b, 0); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++ ++ BUG_ON(btree_node_dirty(b)); ++ ++ mutex_lock(&bc->lock); ++ btree_node_data_free(c, b); ++ bch2_btree_node_hash_remove(bc, b); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_stats stats; ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ bch2_btree_keys_stats(b, &stats); ++ ++ prt_printf(out, "l %u ", b->c.level); ++ bch2_bpos_to_text(out, b->data->min_key); ++ prt_printf(out, " - "); ++ bch2_bpos_to_text(out, b->data->max_key); ++ prt_printf(out, ":\n" ++ " ptrs: "); ++ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ ++ prt_printf(out, "\n" ++ " format: u64s %u fields %u %u %u %u %u\n" ++ " unpack fn len: %u\n" ++ " bytes used %zu/%zu (%zu%% full)\n" ++ " sib u64s: %u, %u (merge threshold %u)\n" ++ " nr packed keys %u\n" ++ " nr unpacked keys %u\n" ++ " floats %zu\n" ++ " failed unpacked %zu\n", ++ f->key_u64s, ++ f->bits_per_field[0], ++ f->bits_per_field[1], ++ f->bits_per_field[2], ++ f->bits_per_field[3], ++ f->bits_per_field[4], ++ b->unpack_fn_len, ++ b->nr.live_u64s * sizeof(u64), ++ btree_bytes(c) - sizeof(struct btree_node), ++ b->nr.live_u64s * 100 / btree_max_u64s(c), ++ b->sib_u64s[0], ++ b->sib_u64s[1], ++ c->btree_foreground_merge_threshold, ++ b->nr.packed_keys, ++ b->nr.unpacked_keys, ++ stats.floats, ++ stats.failed); ++} ++ ++void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); ++ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); ++ prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); ++} +diff --git a/fs/bcachefs/btree_cache.h 
b/fs/bcachefs/btree_cache.h +new file mode 100644 +index 000000000000..25906127c023 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.h +@@ -0,0 +1,107 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_CACHE_H ++#define _BCACHEFS_BTREE_CACHE_H ++ ++#include "bcachefs.h" ++#include "btree_types.h" ++ ++extern struct lock_class_key bch2_btree_node_lock_key; ++ ++extern const char * const bch2_btree_node_flags[]; ++ ++struct btree_iter; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *); ++ ++void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); ++int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); ++int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, ++ unsigned, enum btree_id); ++ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); ++ ++struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); ++ ++struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, ++ const struct bkey_i *, unsigned, ++ enum six_lock_type, unsigned long); ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, ++ enum btree_id, unsigned, bool); ++ ++int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, ++ const struct bkey_i *, enum btree_id, unsigned); ++ ++void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *); ++int bch2_fs_btree_cache_init(struct bch_fs *); ++void bch2_fs_btree_cache_init_early(struct btree_cache *); ++ ++static inline u64 btree_ptr_hash_val(const struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ case KEY_TYPE_btree_ptr_v2: ++ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; ++ default: ++ return 0; ++ } ++} ++ ++static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr ++ : NULL; ++} ++ ++/* is btree node in hash table? 
*/ ++static inline bool btree_node_hashed(struct btree *b) ++{ ++ return b->hash_val != 0; ++} ++ ++#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ ++ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ ++ &(_c)->btree_cache.table), \ ++ _iter = 0; _iter < (_tbl)->size; _iter++) \ ++ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) ++ ++static inline size_t btree_bytes(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size; ++} ++ ++static inline size_t btree_max_u64s(struct bch_fs *c) ++{ ++ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); ++} ++ ++static inline size_t btree_pages(struct bch_fs *c) ++{ ++ return btree_bytes(c) / PAGE_SIZE; ++} ++ ++static inline unsigned btree_blocks(struct bch_fs *c) ++{ ++ return btree_sectors(c) >> c->block_bits; ++} ++ ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) ++ ++#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) ++#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) ++ ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) ++ ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, ++ struct btree *); ++void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +new file mode 100644 +index 000000000000..2f563365ea4c +--- /dev/null ++++ b/fs/bcachefs/btree_gc.c +@@ -0,0 +1,2098 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * Copyright (C) 2014 Datera Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "bkey_buf.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "recovery.h" ++#include "reflink.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DROP_THIS_NODE 10 ++#define DROP_PREV_NODE 11 ++ ++static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ preempt_disable(); ++ write_seqcount_begin(&c->gc_pos_lock); ++ c->gc_pos = new_pos; ++ write_seqcount_end(&c->gc_pos_lock); ++ preempt_enable(); ++} ++ ++static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); ++ __gc_pos_set(c, new_pos); ++} ++ ++/* ++ * Missing: if an interior btree node is empty, we need to do something - ++ * perhaps just kill it ++ */ ++static int bch2_gc_check_topology(struct bch_fs *c, ++ struct btree *b, ++ struct bkey_buf *prev, ++ struct bkey_buf cur, ++ bool is_last) ++{ ++ struct bpos node_start = b->data->min_key; ++ struct bpos node_end = b->data->max_key; ++ struct bpos expected_start = bkey_deleted(&prev->k->k) ++ ? 
node_start ++ : bpos_successor(prev->k->k.p); ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); ++ ++ if (bpos_cmp(expected_start, bp->v.min_key)) { ++ bch2_topology_error(c); ++ ++ if (bkey_deleted(&prev->k->k)) { ++ prt_printf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, node_start); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); ++ } ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); ++ ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ ret = -BCH_ERR_need_topology_repair; ++ goto err; ++ } else { ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ } ++ } ++ } ++ ++ if (is_last && bpos_cmp(cur.k->k.p, node_end)) { ++ bch2_topology_error(c); ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); ++ bch2_bpos_to_text(&buf2, node_end); ++ ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ ret = -BCH_ERR_need_topology_repair; ++ goto err; ++ } else { ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ } ++ } ++ ++ bch2_bkey_buf_copy(prev, c, cur.k); ++err: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) ++{ ++ switch (b->key.k.type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); ++ ++ dst->k.p = src->k.p; ++ dst->v.mem_ptr = 0; ++ dst->v.seq = b->data->keys.seq; ++ dst->v.sectors_written = 0; ++ dst->v.flags = 0; ++ dst->v.min_key = b->data->min_key; ++ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); ++ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); ++ break; ++ } ++ case KEY_TYPE_btree_ptr_v2: ++ bkey_copy(&dst->k_i, &b->key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void bch2_btree_node_update_key_early(struct bch_fs *c, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new) ++{ ++ struct btree *b; ++ struct bkey_buf tmp; ++ int ret; ++ ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_reassemble(&tmp, c, old); ++ ++ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); ++ if (!IS_ERR_OR_NULL(b)) { ++ mutex_lock(&c->btree_cache.lock); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ bch2_bkey_buf_exit(&tmp, c); ++} ++ ++static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) ++{ ++ struct bkey_i_btree_ptr_v2 *new; ++ int ret; ++ ++ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ btree_ptr_to_v2(b, 
new); ++ b->data->min_key = new_min; ++ new->v.min_key = new_min; ++ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ++ ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ bch2_btree_node_drop_keys_outside_node(b); ++ bkey_copy(&b->key, &new->k_i); ++ return 0; ++} ++ ++static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) ++{ ++ struct bkey_i_btree_ptr_v2 *new; ++ int ret; ++ ++ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); ++ if (ret) ++ return ret; ++ ++ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ btree_ptr_to_v2(b, new); ++ b->data->max_key = new_max; ++ new->k.p = new_max; ++ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ++ ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ bch2_btree_node_drop_keys_outside_node(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, &new->k_i); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ return 0; ++} ++ ++static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, ++ struct btree *prev, struct btree *cur) ++{ ++ struct bpos expected_start = !prev ++ ? b->data->min_key ++ : bpos_successor(prev->key.k.p); ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (!prev) { ++ prt_printf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, b->data->min_key); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); ++ } ++ ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); ++ ++ if (prev && ++ bpos_cmp(expected_start, cur->data->min_key) > 0 && ++ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { ++ /* cur overwrites prev: */ ++ ++ if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key, ++ cur->data->min_key) >= 0, c, ++ "btree node overwritten by next node at btree %s level %u:\n" ++ " node %s\n" ++ " next %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) { ++ ret = DROP_PREV_NODE; ++ goto out; ++ } ++ ++ if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, ++ bpos_predecessor(cur->data->min_key)), c, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " node %s\n" ++ " next %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) ++ ret = set_node_max(c, prev, ++ bpos_predecessor(cur->data->min_key)); ++ } else { ++ /* prev overwrites cur: */ ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, ++ cur->data->max_key) >= 0, c, ++ "btree node overwritten by prev node at btree %s level %u:\n" ++ " prev %s\n" ++ " node %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) { ++ ret = DROP_THIS_NODE; ++ goto out; ++ } ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " node %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) ++ ret = set_node_min(c, cur, expected_start); ++ } ++out: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static int btree_repair_node_end(struct bch_fs *c, struct btree *b, ++ struct btree *child) ++{ ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ int ret = 0; ++ ++ bch2_bkey_val_to_text(&buf1, c, 
bkey_i_to_s_c(&child->key)); ++ bch2_bpos_to_text(&buf2, b->key.k.p); ++ ++ if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) { ++ ret = set_node_max(c, child, b->key.k.p); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf prev_k, cur_k; ++ struct btree *prev = NULL, *cur = NULL; ++ bool have_child, dropped_children = false; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (!b->c.level) ++ return 0; ++again: ++ prev = NULL; ++ have_child = dropped_children = false; ++ bch2_bkey_buf_init(&prev_k); ++ bch2_bkey_buf_init(&cur_k); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ bch2_bkey_buf_reassemble(&cur_k, c, k); ++ ++ cur = bch2_btree_node_get_noiter(c, cur_k.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(cur); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); ++ ++ if (mustfix_fsck_err_on(ret == -EIO, c, ++ "Topology repair: unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ buf.buf)) { ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ cur = NULL; ++ if (ret) ++ break; ++ continue; ++ } ++ ++ if (ret) { ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); ++ break; ++ } ++ ++ ret = btree_repair_node_boundaries(c, b, prev, cur); ++ ++ if (ret == DROP_THIS_NODE) { ++ six_unlock_read(&cur->c.lock); ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ cur = NULL; ++ if (ret) ++ break; ++ continue; ++ } ++ ++ if (prev) ++ six_unlock_read(&prev->c.lock); ++ prev = NULL; ++ ++ if (ret == DROP_PREV_NODE) { ++ bch2_btree_node_evict(c, prev_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, prev_k.k->k.p); ++ if (ret) ++ break; ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_bkey_buf_exit(&prev_k, c); ++ bch2_bkey_buf_exit(&cur_k, c); ++ goto again; ++ } else if (ret) ++ break; ++ ++ prev = cur; ++ cur = NULL; ++ bch2_bkey_buf_copy(&prev_k, c, cur_k.k); ++ } ++ ++ if (!ret && !IS_ERR_OR_NULL(prev)) { ++ BUG_ON(cur); ++ ret = btree_repair_node_end(c, b, prev); ++ } ++ ++ if (!IS_ERR_OR_NULL(prev)) ++ six_unlock_read(&prev->c.lock); ++ prev = NULL; ++ if (!IS_ERR_OR_NULL(cur)) ++ six_unlock_read(&cur->c.lock); ++ cur = NULL; ++ ++ if (ret) ++ goto err; ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_buf_reassemble(&cur_k, c, k); ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ cur = bch2_btree_node_get_noiter(c, cur_k.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(cur); ++ ++ if (ret) { ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); ++ 
goto err; ++ } ++ ++ ret = bch2_btree_repair_topology_recurse(c, cur); ++ six_unlock_read(&cur->c.lock); ++ cur = NULL; ++ ++ if (ret == DROP_THIS_NODE) { ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ dropped_children = true; ++ } ++ ++ if (ret) ++ goto err; ++ ++ have_child = true; ++ } ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ ++ if (mustfix_fsck_err_on(!have_child, c, ++ "empty interior btree node at btree %s level %u\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level, buf.buf)) ++ ret = DROP_THIS_NODE; ++err: ++fsck_err: ++ if (!IS_ERR_OR_NULL(prev)) ++ six_unlock_read(&prev->c.lock); ++ if (!IS_ERR_OR_NULL(cur)) ++ six_unlock_read(&cur->c.lock); ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_bkey_buf_exit(&prev_k, c); ++ bch2_bkey_buf_exit(&cur_k, c); ++ ++ if (!ret && dropped_children) ++ goto again; ++ ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_repair_topology(struct bch_fs *c) ++{ ++ struct btree *b; ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR && !ret; i++) { ++ b = c->btree_roots[i].b; ++ if (btree_node_fake(b)) ++ continue; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ ret = bch2_btree_repair_topology_recurse(c, b); ++ six_unlock_read(&b->c.lock); ++ ++ if (ret == DROP_THIS_NODE) { ++ bch_err(c, "empty btree root - repair unimplemented"); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, bool is_root, ++ struct bkey_s_c *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p = { 0 }; ++ bool do_update = false; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ /* ++ * XXX ++ * use check_bucket_ref here ++ */ ++ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); ++ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); ++ ++ if (c->opts.reconstruct_alloc || ++ fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { ++ if (!p.ptr.cached) { ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { ++ if (!p.ptr.cached) { ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ 
do_update = true; ++ ++ if (fsck_err_on(!p.ptr.cached && ++ gen_cmp(p.ptr.gen, g->gen) < 0, c, ++ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ do_update = true; ++ ++ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) ++ continue; ++ ++ if (fsck_err_on(g->data_type && ++ g->data_type != data_type, c, ++ "bucket %u:%zu different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[g->data_type], ++ bch2_data_types[data_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { ++ if (data_type == BCH_DATA_btree) { ++ g->data_type = data_type; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (p.has_ec) { ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); ++ ++ if (fsck_err_on(!m || !m->alive, c, ++ "pointer to nonexistent stripe %llu\n" ++ "while marking %s", ++ (u64) p.ec.idx, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ do_update = true; ++ ++ if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, ++ "pointer does not match stripe %llu\n" ++ "while marking %s", ++ (u64) p.ec.idx, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ do_update = true; ++ } ++ } ++ ++ if (do_update) { ++ struct bkey_ptrs ptrs; ++ union bch_extent_entry *entry; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *new; ++ ++ if (is_root) { ++ bch_err(c, "cannot update btree roots yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!new) { ++ bch_err(c, "%s: error allocating new key", __func__); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(new, *k); ++ ++ if (level) { ++ /* ++ * We don't want to drop btree node pointers - if the ++ * btree node isn't there anymore, the read path will ++ * sort it out: ++ */ ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); ++ ++ ptr->gen = g->gen; ++ } ++ } else { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); ++ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); ++ ++ (ptr->cached && ++ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || ++ (!ptr->cached && ++ gen_cmp(ptr->gen, g->gen) < 0) || ++ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || ++ (g->data_type && ++ g->data_type != data_type); ++ })); ++again: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, ++ entry->stripe_ptr.idx); ++ union bch_extent_entry *next_ptr; ++ ++ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) ++ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) ++ goto found; ++ next_ptr = NULL; ++found: ++ if (!next_ptr) { ++ bch_err(c, "aieee, found stripe ptr with no data ptr"); ++ continue; ++ } ++ ++ if (!m || !m->alive || ++ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], ++ &next_ptr->ptr, ++ m->sectors)) { ++ bch2_bkey_extent_entry_drop(new, entry); ++ goto again; ++ } ++ } 
++ } ++ } ++ ++ ret = bch2_journal_key_insert_take(c, btree_id, level, new); ++ if (ret) { ++ kfree(new); ++ goto err; ++ } ++ ++ if (level) ++ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); ++ ++ if (c->opts.verbose) { ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ } ++ ++ *k = bkey_i_to_s_c(new); ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* marking of btree keys/nodes: */ ++ ++static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, ++ unsigned level, bool is_root, ++ struct bkey_s_c *k, ++ bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey deleted = KEY(0, 0, 0); ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; ++ unsigned flags = ++ BTREE_TRIGGER_GC| ++ (initial ? BTREE_TRIGGER_NOATOMIC : 0); ++ int ret = 0; ++ ++ deleted.p = k->k->p; ++ ++ if (initial) { ++ BUG_ON(bch2_journal_seq_verify && ++ k->k->version.lo > atomic64_read(&c->journal.seq)); ++ ++ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, ++ "key version number higher than recorded: %llu > %llu", ++ k->k->version.lo, ++ atomic64_read(&c->key_version))) ++ atomic64_set(&c->key_version, k->k->version.lo); ++ } ++ ++ ret = commit_do(trans, NULL, NULL, 0, ++ bch2_mark_key(trans, old, *k, flags)); ++fsck_err: ++err: ++ if (ret) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ return ret; ++} ++ ++static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_node_iter iter; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ struct bkey_buf prev, cur; ++ int ret = 0; ++ ++ if (!btree_node_type_needs_gc(btree_node_type(b))) ++ return 0; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ bch2_bkey_buf_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bkey_init(&prev.k->k); ++ ++ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, ++ &k, initial); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (b->c.level) { ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ ++ ret = bch2_gc_check_topology(c, b, &prev, cur, ++ bch2_btree_node_iter_end(&iter)); ++ if (ret) ++ break; ++ } ++ } ++ ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); ++ return ret; ++} ++ ++static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, ++ bool initial, bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct btree *b; ++ unsigned depth = metadata_only ? 
1 : 0; ++ int ret = 0; ++ ++ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); ++ ++ __for_each_btree_node(trans, iter, btree_id, POS_MIN, ++ 0, depth, BTREE_ITER_PREFETCH, b, ret) { ++ bch2_verify_btree_nr_keys(b); ++ ++ gc_pos_set(c, gc_pos_btree_node(b)); ++ ++ ret = btree_gc_mark_node(trans, b, initial); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->btree_root_lock); ++ b = c->btree_roots[btree_id].b; ++ if (!btree_node_fake(b)) { ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ true, &k, initial); ++ } ++ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); ++ mutex_unlock(&c->btree_root_lock); ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, ++ unsigned target_depth) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf cur, prev; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ bch2_bkey_buf_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bkey_init(&prev.k->k); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ false, &k, true); ++ if (ret) { ++ bch_err(c, "%s: error from bch2_gc_mark_key: %s", ++ __func__, bch2_err_str(ret)); ++ goto fsck_err; ++ } ++ ++ if (b->c.level) { ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ k = bkey_i_to_s_c(cur.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ ret = bch2_gc_check_topology(c, b, ++ &prev, cur, ++ !bch2_btree_and_journal_iter_peek(&iter).k); ++ if (ret) ++ goto fsck_err; ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ if (b->c.level > target_depth) { ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ struct btree *child; ++ ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ child = bch2_btree_node_get_noiter(c, cur.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(child); ++ ++ if (ret == -EIO) { ++ bch2_topology_error(c); ++ ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "Unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ++ ret = -BCH_ERR_need_topology_repair; ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ goto fsck_err; ++ } else { ++ /* Continue marking when opted to not ++ * fix the error: */ ++ ret = 0; ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ continue; ++ } ++ } else if (ret) { ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); ++ break; ++ } ++ ++ ret = bch2_gc_btree_init_recurse(trans, child, ++ target_depth); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); ++ bch2_btree_and_journal_iter_exit(&iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_btree_init(struct btree_trans *trans, ++ enum 
btree_id btree_id, ++ bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b; ++ unsigned target_depth = metadata_only ? 1 : 0; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ b = c->btree_roots[btree_id].b; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->min_key); ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, ++ "btree root with incorrect min_key: %s", buf.buf)) { ++ bch_err(c, "repair unimplemented"); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ goto fsck_err; ++ } ++ ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->max_key); ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, ++ "btree root with incorrect max_key: %s", buf.buf)) { ++ bch_err(c, "repair unimplemented"); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ goto fsck_err; ++ } ++ ++ if (b->c.level >= target_depth) ++ ret = bch2_gc_btree_init_recurse(trans, b, target_depth); ++ ++ if (!ret) { ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, ++ &k, true); ++ } ++fsck_err: ++ six_unlock_read(&b->c.lock); ++ ++ if (ret < 0) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) ++{ ++ return (int) btree_id_to_gc_phase(l) - ++ (int) btree_id_to_gc_phase(r); ++} ++ ++static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ struct btree_trans trans; ++ enum btree_id ids[BTREE_ID_NR]; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ if (initial) ++ trans.is_initial_gc = true; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ ids[i] = i; ++ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); ++ ++ for (i = 0; i < BTREE_ID_NR && !ret; i++) ++ ret = initial ++ ? 
bch2_gc_btree_init(&trans, ids[i], metadata_only) ++ : bch2_gc_btree(&trans, ids[i], initial, metadata_only); ++ ++ if (ret < 0) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ unsigned flags) ++{ ++ u64 b = sector_to_bucket(ca, start); ++ ++ do { ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ bch2_mark_metadata_bucket(c, ca, b, type, sectors, ++ gc_phase(GC_PHASE_SB), flags); ++ b++; ++ start += sectors; ++ } while (start < end); ++} ++ ++static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ unsigned i; ++ u64 b; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) ++ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, flags); ++ ++ mark_metadata_sectors(c, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, flags); ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ b = ca->journal.buckets[i]; ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), flags); ++ } ++} ++ ++static void bch2_mark_superblocks(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&c->sb_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_SB)); ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); ++ mutex_unlock(&c->sb_lock); ++} ++ ++#if 0 ++/* Also see bch2_pending_btree_node_free_insert_done() */ ++static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) ++{ ++ struct btree_update *as; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); ++ ++ for_each_pending_btree_node_free(c, as, d) ++ if (d->index_update_done) ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++#endif ++ ++static void bch2_gc_free(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ genradix_free(&c->reflink_gc_table); ++ genradix_free(&c->gc_stripes); ++ ++ for_each_member_device(ca, c, i) { ++ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ca->buckets_gc = NULL; ++ ++ free_percpu(ca->usage_gc); ++ ca->usage_gc = NULL; ++ } ++ ++ free_percpu(c->usage_gc); ++ c->usage_gc = NULL; ++} ++ ++static int bch2_gc_done(struct bch_fs *c, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca = NULL; ++ struct printbuf buf = PRINTBUF; ++ bool verify = !metadata_only && ++ !c->opts.reconstruct_alloc && ++ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); ++ unsigned i, dev; ++ int ret = 0; ++ ++ percpu_down_write(&c->mark_lock); ++ ++#define copy_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f))) \ ++ dst->_f = src->_f ++#define copy_stripe_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f))) \ ++ dst->_f = src->_f ++#define copy_dev_field(_f, _msg, ...) 
\ ++ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) ++#define copy_fs_field(_f, _msg, ...) \ ++ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ ++ for_each_member_device(ca, c, dev) { ++ struct bch_dev_usage *dst = ca->usage_base; ++ struct bch_dev_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ dev_usage_u64s()); ++ ++ copy_dev_field(buckets_ec, "buckets_ec"); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); ++ } ++ }; ++ ++ { ++ unsigned nr = fs_usage_u64s(c); ++ struct bch_fs_usage *dst = c->usage_base; ++ struct bch_fs_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ ++ copy_fs_field(hidden, "hidden"); ++ copy_fs_field(btree, "btree"); ++ ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (metadata_only && ++ (e->data_type == BCH_DATA_user || ++ e->data_type == BCH_DATA_cached)) ++ continue; ++ ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, e); ++ ++ copy_fs_field(replicas[i], "%s", buf.buf); ++ } ++ } ++ ++#undef copy_fs_field ++#undef copy_dev_field ++#undef copy_stripe_field ++#undef copy_field ++fsck_err: ++ if (ca) ++ percpu_ref_put(&ca->ref); ++ if (ret) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ ++ percpu_up_write(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct bch_dev *ca = NULL; ++ unsigned i; ++ ++ BUG_ON(c->usage_gc); ++ ++ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), ++ sizeof(u64), GFP_KERNEL); ++ if (!c->usage_gc) { ++ bch_err(c, "error allocating c->usage_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ BUG_ON(ca->buckets_gc); ++ BUG_ON(ca->usage_gc); ++ ++ ca->usage_gc = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage_gc) { ++ bch_err(c, "error allocating ca->usage_gc"); ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ ++ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, ++ ca->mi.nbuckets - ca->mi.first_bucket); ++ } ++ ++ return 0; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, ++ struct bch_alloc_v4 r) ++{ ++ return l.gen != r.gen || ++ l.oldest_gen != r.oldest_gen || ++ l.data_type != r.data_type || ++ l.dirty_sectors != r.dirty_sectors || ++ l.cached_sectors != r.cached_sectors || ++ l.stripe_redundancy != r.stripe_redundancy || ++ l.stripe != r.stripe; ++} ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ struct bucket gc, *b; ++ struct bkey_i_alloc_v4 *a; ++ struct bch_alloc_v4 old, new; ++ enum bch_data_type type; ++ int ret; ++ ++ if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ 
return 1; ++ ++ bch2_alloc_to_v4(k, &old); ++ new = old; ++ ++ percpu_down_read(&c->mark_lock); ++ b = gc_bucket(ca, iter->pos.offset); ++ ++ /* ++ * b->data_type doesn't yet include need_discard & need_gc_gen states - ++ * fix that here: ++ */ ++ type = __alloc_data_type(b->dirty_sectors, ++ b->cached_sectors, ++ b->stripe, ++ old, ++ b->data_type); ++ if (b->data_type != type) { ++ struct bch_dev_usage *u; ++ ++ preempt_disable(); ++ u = this_cpu_ptr(ca->usage_gc); ++ u->d[b->data_type].buckets--; ++ b->data_type = type; ++ u->d[b->data_type].buckets++; ++ preempt_enable(); ++ } ++ ++ gc = *b; ++ percpu_up_read(&c->mark_lock); ++ ++ if (metadata_only && ++ gc.data_type != BCH_DATA_sb && ++ gc.data_type != BCH_DATA_journal && ++ gc.data_type != BCH_DATA_btree) ++ return 0; ++ ++ if (gen_after(old.gen, gc.gen)) ++ return 0; ++ ++#define copy_bucket_field(_f) \ ++ if (c->opts.reconstruct_alloc || \ ++ fsck_err_on(new._f != gc._f, c, \ ++ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", \ ++ iter->pos.inode, iter->pos.offset, \ ++ gc.gen, \ ++ bch2_data_types[gc.data_type], \ ++ new._f, gc._f)) \ ++ new._f = gc._f; \ ++ ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ copy_bucket_field(stripe_redundancy); ++ copy_bucket_field(stripe); ++#undef copy_bucket_field ++ ++ if (!bch2_alloc_v4_cmp(old, new)) ++ return 0; ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; ++ ++ a->v = new; ++ ++ /* ++ * The trigger normally makes sure this is set, but we're not running ++ * triggers: ++ */ ++ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) ++ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ ++ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) { ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, k, metadata_only)); ++ ++ if (ret < 0) { ++ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bucket *g; ++ struct bch_alloc_v4 a; ++ unsigned i; ++ int ret; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!buckets) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = ca->mi.nbuckets; ++ rcu_assign_pointer(ca->buckets_gc, buckets); ++ }; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = gc_bucket(ca, k.k->p.offset); ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ g->gen_valid = 1; ++ g->gen = a.gen; ++ ++ if (metadata_only && ++ (a.data_type == BCH_DATA_user || ++ a.data_type == BCH_DATA_cached || ++ a.data_type == BCH_DATA_parity)) { ++ g->data_type = a.data_type; ++ g->dirty_sectors = a.dirty_sectors; ++ g->cached_sectors = a.cached_sectors; ++ g->stripe = a.stripe; ++ g->stripe_redundancy = a.stripe_redundancy; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = gc_bucket_array(ca); ++ struct bucket *g; ++ ++ for_each_bucket(g, buckets) { ++ if (metadata_only && ++ (g->data_type == BCH_DATA_user || ++ g->data_type == BCH_DATA_cached || ++ g->data_type == BCH_DATA_parity)) ++ continue; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; ++ } ++ }; ++} ++ ++static int bch2_gc_write_reflink_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ size_t *idx) ++{ ++ struct bch_fs *c = trans->c; ++ const __le64 *refcount = bkey_refcount_c(k); ++ struct printbuf buf = PRINTBUF; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (!refcount) ++ return 0; ++ ++ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && ++ r->offset < k.k->p.offset) ++ ++*idx; ++ ++ if (!r || ++ r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); ++ return -EINVAL; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ r->refcount)) { ++ struct bkey_i *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(new, k); ++ ++ if (!r->refcount) ++ new->k.type = KEY_TYPE_deleted; ++ else ++ *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ ++ ret = bch2_trans_update(trans, iter, new, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ size_t idx = 0; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = 
for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); ++ ++ c->reflink_gc_nr = 0; ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bch2_gc_reflink_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ c->reflink_gc_nr = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ continue; ++ ++ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, ++ GFP_KERNEL); ++ if (!r) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ r->offset = k.k->p.offset; ++ r->size = k.k->size; ++ r->refcount = 0; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct genradix_iter iter; ++ struct reflink_gc *r; ++ ++ genradix_for_each(&c->reflink_gc_table, iter, r) ++ r->refcount = 0; ++} ++ ++static int bch2_gc_write_stripes_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ const struct bch_stripe *s; ++ struct gc_stripe *m; ++ unsigned i; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return 0; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) ++ goto inconsistent; ++ return 0; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); ++ ++ ret = bch2_trans_update(trans, iter, &new->k_i, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_stripes_key(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) ++{ ++ genradix_free(&c->gc_stripes); ++} ++ ++/** ++ * bch2_gc - walk _all_ references to buckets, and recompute them: ++ * ++ * Order matters here: ++ * - Concurrent GC relies on the fact that we have a total ordering for ++ * everything that GC walks - see gc_will_visit_node(), ++ * gc_will_visit_root() ++ * ++ * - also, references move around in the course of index updates and ++ * various other crap: everything needs to agree on the ordering ++ * references are allowed to move around in - e.g., we're allowed to ++ * start with a reference owned by an open_bucket (the allocator) and ++ * move it to the btree, but not the reverse. ++ * ++ * This is necessary to ensure that gc doesn't miss references that ++ * move around - if references move backwards in the ordering GC ++ * uses, GC could skip past them ++ */ ++int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ unsigned iter = 0; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ down_write(&c->gc_lock); ++ ++ bch2_btree_interior_updates_flush(c); ++ ++ ret = bch2_gc_start(c, metadata_only) ?: ++ bch2_gc_alloc_start(c, metadata_only) ?: ++ bch2_gc_reflink_start(c, metadata_only); ++ if (ret) ++ goto out; ++again: ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); ++ ++ bch2_mark_superblocks(c); ++ ++ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && ++ c->opts.fix_errors != FSCK_OPT_NO) { ++ bch_info(c, "Starting topology repair pass"); ++ ret = bch2_repair_topology(c); ++ if (ret) ++ goto out; ++ bch_info(c, "Topology repair pass done"); ++ ++ set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); ++ } ++ ++ ret = bch2_gc_btrees(c, initial, metadata_only); ++ ++ if (ret == -BCH_ERR_need_topology_repair && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); ++ ret = 0; ++ } ++ ++ if (ret == -BCH_ERR_need_topology_repair) ++ ret = -BCH_ERR_fsck_errors_not_fixed; ++ ++ if (ret) ++ goto out; ++ ++#if 0 ++ bch2_mark_pending_btree_node_frees(c); ++#endif ++ c->gc_count++; ++ ++ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || ++ (!iter && bch2_test_restart_gc)) { ++ if (iter++ > 2) { ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ * XXX: make sure gens we fixed got saved ++ */ ++ bch_info(c, "Second GC pass needed, restarting:"); ++ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_stripes_reset(c, metadata_only); ++ bch2_gc_alloc_reset(c, metadata_only); ++ bch2_gc_reflink_reset(c, metadata_only); ++ ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ goto again; ++ } ++out: ++ 
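++ /*
++ * GC walk finished: if it succeeded, write the recomputed stripe,
++ * reflink and alloc info back and reconcile the in-memory usage
++ * counters with the journal blocked, then free the gc-only state:
++ */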
if (!ret) { ++ bch2_journal_block(&c->journal); ++ ++ ret = bch2_gc_stripes_done(c, metadata_only) ?: ++ bch2_gc_reflink_done(c, metadata_only) ?: ++ bch2_gc_alloc_done(c, metadata_only) ?: ++ bch2_gc_done(c, initial, metadata_only); ++ ++ bch2_journal_unblock(&c->journal); ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ /* Indicates that gc is no longer in progress: */ ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ ++ up_write(&c->gc_lock); ++ ++ /* ++ * At startup, allocations can happen directly instead of via the ++ * allocator thread - issue wakeup in case they blocked on gc_lock: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ return ret; ++} ++ ++static int gc_btree_gens_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ struct bkey_i *u; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ptr_stale(ca, ptr) > 16) { ++ percpu_up_read(&c->mark_lock); ++ goto update; ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; ++ ++ if (gen_after(*gen, ptr->gen)) ++ *gen = ptr->gen; ++ } ++ percpu_up_read(&c->mark_lock); ++ return 0; ++update: ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ ++ bch2_extent_normalize(c, bkey_i_to_s(u)); ++ return bch2_trans_update(trans, iter, u, 0); ++} ++ ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); ++ struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a_mut; ++ int ret; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) ++ return 0; ++ ++ a_mut = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ return ret; ++ ++ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; ++ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); ++ ++ return bch2_trans_update(trans, iter, &a_mut->k_i, 0); ++} ++ ++int bch2_gc_gens(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ u64 b, start_time = local_clock(); ++ unsigned i; ++ int ret; ++ ++ /* ++ * Ideally we would be using state_lock and not gc_lock here, but that ++ * introduces a deadlock in the RO path - we currently take the state ++ * lock at the start of going RO, thus the gc thread may get stuck: ++ */ ++ if (!mutex_trylock(&c->gc_gens_lock)) ++ return 0; ++ ++ trace_gc_gens_start(c); ++ down_read(&c->gc_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_gens *gens; ++ ++ BUG_ON(ca->oldest_gen); ++ ++ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); ++ if (!ca->oldest_gen) { ++ percpu_ref_put(&ca->ref); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ gens = bucket_gens(ca); ++ ++ for (b = gens->first_bucket; ++ b < gens->nbuckets; b++) ++ ca->oldest_gen[b] = gens->b[b]; ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ c->gc_gens_btree = i; ++ c->gc_gens_pos = POS_MIN; 
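++ /*
++ * Walk this btree and have gc_btree_gens_key() record the oldest
++ * pointer generation seen for each bucket, rewriting keys whose
++ * pointers have gone excessively stale:
++ */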
++ ret = for_each_btree_key_commit(&trans, iter, i, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ gc_btree_gens_key(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, ++ BTREE_ITER_PREFETCH, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ c->gc_gens_btree = 0; ++ c->gc_gens_pos = POS_MIN; ++ ++ c->gc_count++; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); ++ trace_gc_gens_end(c); ++err: ++ for_each_member_device(ca, c, i) { ++ kvfree(ca->oldest_gen); ++ ca->oldest_gen = NULL; ++ } ++ ++ bch2_trans_exit(&trans); ++ up_read(&c->gc_lock); ++ mutex_unlock(&c->gc_gens_lock); ++ return ret; ++} ++ ++static int bch2_gc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last = atomic64_read(&clock->now); ++ unsigned last_kick = atomic_read(&c->kick_gc); ++ int ret; ++ ++ set_freezable(); ++ ++ while (1) { ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ if (kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (atomic_read(&c->kick_gc) != last_kick) ++ break; ++ ++ if (c->btree_gc_periodic) { ++ unsigned long next = last + c->capacity / 16; ++ ++ if (atomic64_read(&clock->now) >= next) ++ break; ++ ++ bch2_io_clock_schedule_timeout(clock, next); ++ } else { ++ schedule(); ++ } ++ ++ try_to_freeze(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ last = atomic64_read(&clock->now); ++ last_kick = atomic_read(&c->kick_gc); ++ ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ ret = bch2_gc(c, false, false); ++#else ++ ret = bch2_gc_gens(c); ++#endif ++ if (ret < 0) ++ bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); ++ ++ debug_check_no_locks_held(); ++ } ++ ++ return 0; ++} ++ ++void bch2_gc_thread_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ p = c->gc_thread; ++ c->gc_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_gc_thread_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ if (c->gc_thread) ++ return 0; ++ ++ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); ++ if (IS_ERR(p)) { ++ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); ++ return PTR_ERR(p); ++ } ++ ++ get_task_struct(p); ++ c->gc_thread = p; ++ wake_up_process(p); ++ return 0; ++} +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +new file mode 100644 +index 000000000000..95d803b5743d +--- /dev/null ++++ b/fs/bcachefs/btree_gc.h +@@ -0,0 +1,112 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_GC_H ++#define _BCACHEFS_BTREE_GC_H ++ ++#include "btree_types.h" ++ ++int bch2_gc(struct bch_fs *, bool, bool); ++int bch2_gc_gens(struct bch_fs *); ++void bch2_gc_thread_stop(struct bch_fs *); ++int bch2_gc_thread_start(struct bch_fs *); ++ ++/* ++ * For concurrent mark and sweep (with other index updates), we define a total ++ * ordering of _all_ references GC walks: ++ * ++ * Note that some references will have the same GC position as others - e.g. 
++ * everything within the same btree node; in those cases we're relying on ++ * whatever locking exists for where those references live, i.e. the write lock ++ * on a btree node. ++ * ++ * That locking is also required to ensure GC doesn't pass the updater in ++ * between the updater adding/removing the reference and updating the GC marks; ++ * without that, we would at best double count sometimes. ++ * ++ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ ++ * be held that prevents GC from passing the position the updater is at. ++ * ++ * (What about the start of gc, when we're clearing all the marks? GC clears the ++ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc ++ * position inside its cmpxchg loop, so crap magically works). ++ */ ++ ++/* Position of (the start of) a gc phase: */ ++static inline struct gc_pos gc_phase(enum gc_phase phase) ++{ ++ return (struct gc_pos) { ++ .phase = phase, ++ .pos = POS_MIN, ++ .level = 0, ++ }; ++} ++ ++static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) ++{ ++ return cmp_int(l.phase, r.phase) ?: ++ bpos_cmp(l.pos, r.pos) ?: ++ cmp_int(l.level, r.level); ++} ++ ++static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) ++{ ++ switch (id) { ++#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; ++ BCH_BTREE_IDS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct gc_pos gc_pos_btree(enum btree_id id, ++ struct bpos pos, unsigned level) ++{ ++ return (struct gc_pos) { ++ .phase = btree_id_to_gc_phase(id), ++ .pos = pos, ++ .level = level, ++ }; ++} ++ ++/* ++ * GC position of the pointers within a btree node: note, _not_ for &b->key ++ * itself, that lives in the parent node: ++ */ ++static inline struct gc_pos gc_pos_btree_node(struct btree *b) ++{ ++ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); ++} ++ ++/* ++ * GC position of the pointer to a btree root: we don't use ++ * gc_pos_pointer_to_btree_node() here to avoid a potential race with ++ * btree_split() increasing the tree depth - the new root will have level > the ++ * old root and thus have a greater gc position than the old root, but that ++ * would be incorrect since once gc has marked the root it's not coming back. 
++ */ ++static inline struct gc_pos gc_pos_btree_root(enum btree_id id) ++{ ++ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); ++} ++ ++static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) ++{ ++ unsigned seq; ++ bool ret; ++ ++ do { ++ seq = read_seqcount_begin(&c->gc_pos_lock); ++ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; ++ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); ++ ++ return ret; ++} ++ ++static inline void bch2_do_gc_gens(struct bch_fs *c) ++{ ++ atomic_inc(&c->kick_gc); ++ if (c->gc_thread) ++ wake_up_process(c->gc_thread); ++} ++ ++#endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +new file mode 100644 +index 000000000000..ae731b3a3908 +--- /dev/null ++++ b/fs/bcachefs/btree_io.c +@@ -0,0 +1,2150 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++void bch2_btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ ++ clear_btree_node_write_in_flight_inner(b); ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_node_io_lock(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void __bch2_btree_node_wait_on_read(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void __bch2_btree_node_wait_on_write(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void bch2_btree_node_wait_on_read(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void bch2_btree_node_wait_on_write(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static void verify_no_dups(struct btree *b, ++ struct bkey_packed *start, ++ struct bkey_packed *end) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bkey_packed *k, *p; ++ ++ if (start == end) ++ return; ++ ++ for (p = start, k = bkey_next(start); ++ k != end; ++ p = k, k = bkey_next(k)) { ++ struct bkey l = bkey_unpack_key(b, p); ++ struct bkey r = bkey_unpack_key(b, k); ++ ++ BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ } ++#endif ++} ++ ++static void set_needs_whiteout(struct bset *i, int v) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ k->needs_whiteout = v; ++} ++ ++static void btree_bounce_free(struct bch_fs *c, size_t size, ++ bool used_mempool, void *p) ++{ ++ if (used_mempool) ++ mempool_free(p, &c->btree_bounce_pool); ++ else ++ vpfree(p, size); ++} ++ ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ++ bool *used_mempool) ++{ ++ unsigned flags = memalloc_nofs_save(); ++ void *p; ++ ++ BUG_ON(size > btree_bytes(c)); ++ ++ *used_mempool = false; ++ p = 
vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); ++ if (!p) { ++ *used_mempool = true; ++ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); ++ } ++ memalloc_nofs_restore(flags); ++ return p; ++} ++ ++static void sort_bkey_ptrs(const struct btree *bt, ++ struct bkey_packed **ptrs, unsigned nr) ++{ ++ unsigned n = nr, a = nr / 2, b, c, d; ++ ++ if (!a) ++ return; ++ ++ /* Heap sort: see lib/sort.c: */ ++ while (1) { ++ if (a) ++ a--; ++ else if (--n) ++ swap(ptrs[0], ptrs[n]); ++ else ++ break; ++ ++ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) ++ b = bch2_bkey_cmp_packed(bt, ++ ptrs[c], ++ ptrs[d]) >= 0 ? c : d; ++ if (d == n) ++ b = c; ++ ++ while (b != a && ++ bch2_bkey_cmp_packed(bt, ++ ptrs[a], ++ ptrs[b]) >= 0) ++ b = (b - 1) / 2; ++ c = b; ++ while (b != a) { ++ b = (b - 1) / 2; ++ swap(ptrs[b], ptrs[c]); ++ } ++ } ++} ++ ++static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; ++ bool used_mempool = false; ++ size_t bytes = b->whiteout_u64s * sizeof(u64); ++ ++ if (!b->whiteout_u64s) ++ return; ++ ++ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); ++ ++ for (k = unwritten_whiteouts_start(c, b); ++ k != unwritten_whiteouts_end(c, b); ++ k = bkey_next(k)) ++ *--ptrs = k; ++ ++ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); ++ ++ k = new_whiteouts; ++ ++ while (ptrs != ptrs_end) { ++ bkey_copy(k, *ptrs); ++ k = bkey_next(k); ++ ptrs++; ++ } ++ ++ verify_no_dups(b, new_whiteouts, ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); ++ ++ memcpy_u64s(unwritten_whiteouts_start(c, b), ++ new_whiteouts, b->whiteout_u64s); ++ ++ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); ++} ++ ++static bool should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, enum compact_mode mode) ++{ ++ if (!bset_dead_u64s(b, t)) ++ return false; ++ ++ switch (mode) { ++ case COMPACT_LAZY: ++ return should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t))); ++ case COMPACT_ALL: ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) ++{ ++ struct bset_tree *t; ++ bool ret = false; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ ret = true; ++ ++ if (!should_compact_bset(b, t, ret, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ start = btree_bkey_first(b, t); ++ end = btree_bkey_last(b, t); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next(k); ++ ++ if (!bkey_deleted(k)) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } else { ++ BUG_ON(k->needs_whiteout); ++ } ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ ret = true; ++ } ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return ret; ++} ++ ++bool bch2_compact_whiteouts(struct bch_fs *c, struct 
btree *b, ++ enum compact_mode mode) ++{ ++ return bch2_drop_whiteouts(b, mode); ++} ++ ++static void btree_node_sort(struct bch_fs *c, struct btree *b, ++ unsigned start_idx, ++ unsigned end_idx, ++ bool filter_whiteouts) ++{ ++ struct btree_node *out; ++ struct sort_iter sort_iter; ++ struct bset_tree *t; ++ struct bset *start_bset = bset(b, &b->set[start_idx]); ++ bool used_mempool = false; ++ u64 start_time, seq = 0; ++ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; ++ bool sorting_entire_node = start_idx == 0 && ++ end_idx == b->nsets; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ for (t = b->set + start_idx; ++ t < b->set + end_idx; ++ t++) { ++ u64s += le16_to_cpu(bset(b, t)->u64s); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ } ++ ++ bytes = sorting_entire_node ++ ? btree_bytes(c) ++ : __vstruct_bytes(struct btree_node, u64s); ++ ++ out = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ start_time = local_clock(); ++ ++ u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); ++ ++ out->keys.u64s = cpu_to_le16(u64s); ++ ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); ++ ++ if (sorting_entire_node) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ /* Make sure we preserve bset journal_seq: */ ++ for (t = b->set + start_idx; t < b->set + end_idx; t++) ++ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); ++ start_bset->journal_seq = cpu_to_le64(seq); ++ ++ if (sorting_entire_node) { ++ unsigned u64s = le16_to_cpu(out->keys.u64s); ++ ++ BUG_ON(bytes != btree_bytes(c)); ++ ++ /* ++ * Our temporary buffer is the same size as the btree node's ++ * buffer, we can just swap buffers instead of doing a big ++ * memcpy() ++ */ ++ *out = *b->data; ++ out->keys.u64s = cpu_to_le16(u64s); ++ swap(out, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ } else { ++ start_bset->u64s = out->keys.u64s; ++ memcpy_u64s(start_bset->start, ++ out->keys.start, ++ le16_to_cpu(out->keys.u64s)); ++ } ++ ++ for (i = start_idx + 1; i < end_idx; i++) ++ b->nr.bset_u64s[start_idx] += ++ b->nr.bset_u64s[i]; ++ ++ b->nsets -= shift; ++ ++ for (i = start_idx + 1; i < b->nsets; i++) { ++ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; ++ b->set[i] = b->set[i + shift]; ++ } ++ ++ for (i = b->nsets; i < MAX_BSETS; i++) ++ b->nr.bset_u64s[i] = 0; ++ ++ set_btree_bset_end(b, &b->set[start_idx]); ++ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); ++ ++ btree_bounce_free(c, bytes, used_mempool, out); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *c, ++ struct btree *dst, ++ struct btree *src) ++{ ++ struct btree_nr_keys nr; ++ struct btree_node_iter src_iter; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(dst->nsets != 1); ++ ++ bch2_bset_set_no_aux_tree(dst, dst->set); ++ ++ bch2_btree_node_iter_init_from_start(&src_iter, src); ++ ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ set_btree_bset_end(dst, dst->set); ++ ++ dst->nr.live_u64s += nr.live_u64s; ++ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; ++ dst->nr.packed_keys += nr.packed_keys; ++ dst->nr.unpacked_keys += nr.unpacked_keys; ++ ++ bch2_verify_btree_nr_keys(dst); ++} ++ ++#define SORT_CRIT (4096 / sizeof(u64)) ++ ++/* ++ * We're about to add another bset to the btree node, so if there's currently ++ * too many bsets - sort some of them together: ++ */ ++static bool 
btree_node_compact(struct bch_fs *c, struct btree *b) ++{ ++ unsigned unwritten_idx; ++ bool ret = false; ++ ++ for (unwritten_idx = 0; ++ unwritten_idx < b->nsets; ++ unwritten_idx++) ++ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) ++ break; ++ ++ if (b->nsets - unwritten_idx > 1) { ++ btree_node_sort(c, b, unwritten_idx, ++ b->nsets, false); ++ ret = true; ++ } ++ ++ if (unwritten_idx > 1) { ++ btree_node_sort(c, b, 0, unwritten_idx, false); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_build_aux_trees(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ bch2_bset_build_aux_tree(b, t, ++ !bset_written(b, bset(b, t)) && ++ t == bset_tree_last(b)); ++} ++ ++/* ++ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be ++ * inserted into ++ * ++ * Safe to call if there already is an unwritten bset - will only add a new bset ++ * if @b doesn't already have one. ++ * ++ * Returns true if we sorted (i.e. invalidated iterators ++ */ ++void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_node_entry *bne; ++ bool reinit_iter = false; ++ ++ EBUG_ON(!(b->c.lock.state.seq & 1)); ++ BUG_ON(bset_written(b, bset(b, &b->set[1]))); ++ ++ if (b->nsets == MAX_BSETS && ++ !btree_node_write_in_flight(b)) { ++ unsigned log_u64s[] = { ++ ilog2(bset_u64s(&b->set[0])), ++ ilog2(bset_u64s(&b->set[1])), ++ ilog2(bset_u64s(&b->set[2])), ++ }; ++ ++ if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { ++ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); ++ reinit_iter = true; ++ } ++ } ++ ++ if (b->nsets == MAX_BSETS && ++ btree_node_compact(c, b)) ++ reinit_iter = true; ++ ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ if (reinit_iter) ++ bch2_trans_node_reinit_iter(trans, b); ++} ++ ++static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ prt_printf(out, "%s level %u/%u\n ", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level, ++ c->btree_roots[b->c.btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++} ++ ++static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct bch_dev *ca, ++ struct btree *b, struct bset *i, ++ unsigned offset, int write) ++{ ++ prt_printf(out, "error validating btree node "); ++ if (write) ++ prt_printf(out, "before write "); ++ if (ca) ++ prt_printf(out, "on %s ", ca->name); ++ prt_printf(out, "at btree "); ++ btree_pos_to_text(out, c, b); ++ ++ prt_printf(out, "\n node offset %u", b->written); ++ if (i) ++ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); ++} ++ ++enum btree_err_type { ++ BTREE_ERR_FIXABLE, ++ BTREE_ERR_WANT_RETRY, ++ BTREE_ERR_MUST_RETRY, ++ BTREE_ERR_FATAL, ++}; ++ ++enum btree_validate_ret { ++ BTREE_RETRY_READ = 64, ++}; ++ ++#define btree_err(type, c, ca, b, i, msg, ...) 
\ ++({ \ ++ __label__ out; \ ++ struct printbuf out = PRINTBUF; \ ++ \ ++ btree_err_msg(&out, c, ca, b, i, b->written, write); \ ++ prt_printf(&out, ": " msg, ##__VA_ARGS__); \ ++ \ ++ if (type == BTREE_ERR_FIXABLE && \ ++ write == READ && \ ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ ++ mustfix_fsck_err(c, "%s", out.buf); \ ++ goto out; \ ++ } \ ++ \ ++ switch (write) { \ ++ case READ: \ ++ bch_err(c, "%s", out.buf); \ ++ \ ++ switch (type) { \ ++ case BTREE_ERR_FIXABLE: \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ case BTREE_ERR_WANT_RETRY: \ ++ if (have_retry) { \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case BTREE_ERR_MUST_RETRY: \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ case BTREE_ERR_FATAL: \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write: %s", out.buf);\ ++ \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++out: \ ++ printbuf_exit(&out); \ ++ true; \ ++}) ++ ++#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) ++ ++/* ++ * When btree topology repair changes the start or end of a node, that might ++ * mean we have to drop keys that are no longer inside the node: ++ */ ++void bch2_btree_node_drop_keys_outside_node(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_s_c k; ++ struct bkey unpacked; ++ struct btree_node_iter iter; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) ++ break; ++ ++ if (k != i->start) { ++ unsigned shift = (u64 *) k - (u64 *) i->start; ++ ++ memmove_u64s_down(i->start, k, ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) ++ break; ++ ++ if (k != vstruct_last(i)) { ++ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ } ++ ++ bch2_btree_build_aux_trees(b); ++ ++ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ } ++} ++ ++static int validate_bset(struct bch_fs *c, struct bch_dev *ca, ++ struct btree *b, struct bset *i, ++ unsigned offset, unsigned sectors, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ const char *err; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret = 0; ++ ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(version < c->sb.version_min, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "bset version %u older than superblock version_min %u", ++ version, c->sb.version_min)) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version_min = cpu_to_le16(version); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (btree_err_on(version > c->sb.version, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "bset version %u newer than superblock version %u", ++ 
version, c->sb.version)) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = cpu_to_le16(version); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ btree_err_on(BSET_SEPARATE_WHITEOUTS(i), ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "BSET_SEPARATE_WHITEOUTS no longer supported"); ++ ++ if (btree_err_on(offset + sectors > btree_sectors(c), ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ ret = 0; ++ goto out; ++ } ++ ++ btree_err_on(offset && !i->u64s, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "empty bset"); ++ ++ btree_err_on(BSET_OFFSET(i) && ++ BSET_OFFSET(i) != offset, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "bset at wrong sector offset"); ++ ++ if (!offset) { ++ struct btree_node *bn = ++ container_of(i, struct btree_node, keys); ++ /* These indicate that we read the wrong btree node: */ ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ /* XXX endianness */ ++ btree_err_on(bp->seq != bn->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "incorrect sequence number (wrong btree node)"); ++ } ++ ++ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ "incorrect btree id"); ++ ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ "incorrect level"); ++ ++ if (!write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ if (BTREE_PTR_RANGE_UPDATED(bp)) { ++ b->data->min_key = bp->min_key; ++ b->data->max_key = b->key.k.p; ++ } ++ ++ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "incorrect min_key: got %s should be %s", ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), ++ (printbuf_reset(&buf2), ++ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); ++ } ++ ++ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ "incorrect max key %s", ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); ++ ++ if (write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ err = bch2_bkey_format_validate(&bn->format); ++ btree_err_on(err, ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "invalid bkey format: %s", err); ++ ++ compat_bformat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &bn->format); ++ } ++out: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static int bset_key_invalid(struct bch_fs *c, struct btree *b, ++ struct bkey_s_c k, ++ bool updated_range, int rw, ++ struct printbuf *err) ++{ ++ return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: ++ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: ++ (rw == WRITE ? 
bch2_bkey_val_invalid(c, k, READ, err) : 0); ++} ++ ++static int validate_bset_keys(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned *whiteout_u64s, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ struct bkey_packed *k, *prev = NULL; ++ struct printbuf buf = PRINTBUF; ++ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); ++ int ret = 0; ++ ++ for (k = i->start; ++ k != vstruct_last(i);) { ++ struct bkey_s u; ++ struct bkey tmp; ++ ++ if (btree_err_on(bkey_next(k) > vstruct_last(i), ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "key extends past end of bset")) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "invalid bkey format %u", k->format)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ /* XXX: validate k->u64s */ ++ if (!write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ u = __bkey_disassemble(b, k, &tmp); ++ ++ printbuf_reset(&buf); ++ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "invalid bkey: "); ++ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); ++ ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ struct bkey up = bkey_unpack_key(b, prev); ++ ++ printbuf_reset(&buf); ++ prt_printf(&buf, "keys out of order: "); ++ bch2_bkey_to_text(&buf, &up); ++ prt_printf(&buf, " > "); ++ bch2_bkey_to_text(&buf, u.k); ++ ++ bch2_dump_bset(c, b, i, 0); ++ ++ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ } ++ ++ prev = k; ++ k = bkey_next(k); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ++ struct btree *b, bool have_retry) ++{ ++ struct btree_node_entry *bne; ++ struct sort_iter *iter; ++ struct btree_node *sorted; ++ struct bkey_packed *k; ++ struct bch_extent_ptr *ptr; ++ struct bset *i; ++ bool used_mempool, blacklisted; ++ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); ++ unsigned u64s; ++ unsigned blacklisted_written, nonblacklisted_written = 0; ++ unsigned ptr_written = btree_ptr_sectors_written(&b->key); ++ struct printbuf buf = PRINTBUF; ++ int ret, retry_read = 0, write = READ; ++ ++ b->version_ondisk = U16_MAX; ++ /* We might get called multiple times on read retry: */ ++ b->written = 0; ++ ++ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); ++ sort_iter_init(iter, b); ++ iter->size = (btree_blocks(c) + 1) * 2; ++ ++ if (bch2_meta_read_fault("btree")) ++ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "dynamic fault"); ++ ++ 
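++ /*
++ * Validate the node header first: bad magic, a zero sequence number,
++ * or a sequence number that doesn't match the pointer means we read
++ * the wrong node (or garbage) and must retry from another replica:
++ */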
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "bad magic: want %llx, got %llx", ++ bset_magic(c), le64_to_cpu(b->data->magic)); ++ ++ btree_err_on(!b->data->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "bad btree header: seq 0"); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(b->data->keys.seq != bp->seq, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "got wrong btree node (seq %llx want %llx)", ++ b->data->keys.seq, bp->seq); ++ } ++ ++ while (b->written < (ptr_written ?: btree_sectors(c))) { ++ unsigned sectors, whiteout_u64s = 0; ++ struct nonce nonce; ++ struct bch_csum csum; ++ bool first = !b->written; ++ ++ if (!b->written) { ++ i = &b->data->keys; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "unknown checksum type %llu", ++ BSET_CSUM_TYPE(i)); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ ++ btree_err_on(bch2_crc_cmp(csum, b->data->csum), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "invalid checksum"); ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i", ret)) ++ goto fsck_err; ++ ++ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), ++ BTREE_ERR_FATAL, c, NULL, b, NULL, ++ "btree node does not have NEW_EXTENT_OVERWRITE set"); ++ ++ sectors = vstruct_sectors(b->data, c->block_bits); ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; ++ ++ if (i->seq != b->data->keys.seq) ++ break; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "unknown checksum type %llu", ++ BSET_CSUM_TYPE(i)); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ btree_err_on(bch2_crc_cmp(csum, bne->csum), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "invalid checksum"); ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i\n", ret)) ++ goto fsck_err; ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ b->version_ondisk = min(b->version_ondisk, ++ le16_to_cpu(i->version)); ++ ++ ret = validate_bset(c, ca, b, i, b->written, sectors, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ if (!b->written) ++ btree_node_set_format(b, b->data->format); ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ blacklisted = bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(i->journal_seq), ++ true); ++ ++ btree_err_on(blacklisted && first, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "first btree node bset has blacklisted journal seq (%llu)", ++ le64_to_cpu(i->journal_seq)); ++ ++ btree_err_on(blacklisted && ptr_written, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", ++ le64_to_cpu(i->journal_seq), ++ b->written, b->written + sectors, ptr_written); ++ ++ b->written += sectors; ++ ++ if (blacklisted && !first) ++ continue; ++ ++ sort_iter_add(iter, i->start, ++ vstruct_idx(i, whiteout_u64s)); ++ ++ sort_iter_add(iter, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); ++ ++ nonblacklisted_written = b->written; ++ } ++ ++ if (ptr_written) { ++ 
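++ /* The pointer recorded how many sectors were written - finding fewer means the node data is incomplete: */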
btree_err_on(b->written < ptr_written, ++ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ "btree node data missing: expected %u sectors, found %u", ++ ptr_written, b->written); ++ } else { ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq && ++ !bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), ++ true), ++ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ "found bset signature after last bset"); ++ ++ /* ++ * Blacklisted bsets are those that were written after the most recent ++ * (flush) journal write. Since there wasn't a flush, they may not have ++ * made it to all devices - which means we shouldn't write new bsets ++ * after them, as that could leave a gap and then reads from that device ++ * wouldn't find all the bsets in that btree node - which means it's ++ * important that we start writing new bsets after the most recent _non_ ++ * blacklisted bset: ++ */ ++ blacklisted_written = b->written; ++ b->written = nonblacklisted_written; ++ } ++ ++ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); ++ sorted->keys.u64s = 0; ++ ++ set_btree_bset(b, b->set, &b->data->keys); ++ ++ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); ++ ++ u64s = le16_to_cpu(sorted->keys.u64s); ++ *sorted = *b->data; ++ sorted->keys.u64s = cpu_to_le16(u64s); ++ swap(sorted, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ b->nsets = 1; ++ ++ BUG_ON(b->nr.live_u64s != u64s); ++ ++ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); ++ ++ if (updated_range) ++ bch2_btree_node_drop_keys_outside_node(b); ++ ++ i = &b->data->keys; ++ for (k = i->start; k != vstruct_last(i);) { ++ struct bkey tmp; ++ struct bkey_s u = __bkey_disassemble(b, k, &tmp); ++ ++ printbuf_reset(&buf); ++ ++ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || ++ (bch2_inject_invalid_keys && ++ !bversion_cmp(u.k->version, MAX_VERSION))) { ++ printbuf_reset(&buf); ++ ++ prt_printf(&buf, "invalid bkey: "); ++ bch2_bkey_val_invalid(c, u.s_c, READ, &buf); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); ++ ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ ++ btree_keys_account_key_drop(&b->nr, 0, k); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_bset_end(b, b->set); ++ continue; ++ } ++ ++ if (u.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); ++ ++ bp.v->mem_ptr = 0; ++ } ++ ++ k = bkey_next(k); ++ } ++ ++ bch2_bset_build_aux_tree(b, b->set, false); ++ ++ set_needs_whiteout(btree_bset_first(b), true); ++ ++ btree_node_reset_sib_u64s(b); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) ++ set_btree_node_need_rewrite(b); ++ } ++ ++ if (!ptr_written) ++ set_btree_node_need_rewrite(b); ++out: ++ mempool_free(iter, &c->fill_iter); ++ printbuf_exit(&buf); ++ return retry_read; ++fsck_err: ++ if (ret == BTREE_RETRY_READ) { ++ retry_read = 1; ++ } else { ++ bch2_inconsistent_error(c); ++ set_btree_node_read_error(b); ++ } ++ goto out; ++} ++ ++static void btree_node_read_work(struct work_struct *work) ++{ ++ struct btree_read_bio *rb = ++ container_of(work, struct btree_read_bio, work); ++ struct bch_fs *c = rb->c; ++ struct btree *b = rb->b; ++ struct bch_dev *ca = 
bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ struct bio *bio = &rb->bio; ++ struct bch_io_failures failed = { .nr = 0 }; ++ struct printbuf buf = PRINTBUF; ++ bool saw_error = false; ++ bool retry = false; ++ bool can_retry; ++ ++ goto start; ++ while (1) { ++ retry = true; ++ bch_info(c, "retrying read"); ++ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = rb->pick.ptr.offset; ++ bio->bi_iter.bi_size = btree_bytes(c); ++ ++ if (rb->have_ioref) { ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ submit_bio_wait(bio); ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ } ++start: ++ printbuf_reset(&buf); ++ btree_pos_to_text(&buf, c, b); ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", ++ bch2_blk_status_to_str(bio->bi_status), buf.buf); ++ if (rb->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ rb->have_ioref = false; ++ ++ bch2_mark_io_failure(&failed, &rb->pick); ++ ++ can_retry = bch2_bkey_pick_read_device(c, ++ bkey_i_to_s_c(&b->key), ++ &failed, &rb->pick) > 0; ++ ++ if (!bio->bi_status && ++ !bch2_btree_node_read_done(c, ca, b, can_retry)) { ++ if (retry) ++ bch_info(c, "retry success"); ++ break; ++ } ++ ++ saw_error = true; ++ ++ if (!can_retry) { ++ set_btree_node_read_error(b); ++ break; ++ } ++ } ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], ++ rb->start_time); ++ bio_put(&rb->bio); ++ printbuf_exit(&buf); ++ ++ if (saw_error && !btree_node_read_error(b)) ++ bch2_btree_node_rewrite_async(c, b); ++ ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ queue_work(c->io_complete_wq, &rb->work); ++} ++ ++struct btree_node_read_all { ++ struct closure cl; ++ struct bch_fs *c; ++ struct btree *b; ++ unsigned nr; ++ void *buf[BCH_REPLICAS_MAX]; ++ struct bio *bio[BCH_REPLICAS_MAX]; ++ int err[BCH_REPLICAS_MAX]; ++}; ++ ++static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) ++{ ++ struct btree_node *bn = data; ++ struct btree_node_entry *bne; ++ unsigned offset = 0; ++ ++ if (le64_to_cpu(bn->magic) != bset_magic(c)) ++ return 0; ++ ++ while (offset < btree_sectors(c)) { ++ if (!offset) { ++ offset += vstruct_sectors(bn, c->block_bits); ++ } else { ++ bne = data + (offset << 9); ++ if (bne->keys.seq != bn->keys.seq) ++ break; ++ offset += vstruct_sectors(bne, c->block_bits); ++ } ++ } ++ ++ return offset; ++} ++ ++static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) ++{ ++ struct btree_node *bn = data; ++ struct btree_node_entry *bne; ++ ++ if (!offset) ++ return false; ++ ++ while (offset < btree_sectors(c)) { ++ bne = data + (offset << 9); ++ if (bne->keys.seq == bn->keys.seq) ++ return true; ++ offset++; ++ } ++ ++ return false; ++ return offset; ++} ++ ++static void btree_node_read_all_replicas_done(struct closure *cl) ++{ ++ struct btree_node_read_all *ra = ++ container_of(cl, struct btree_node_read_all, cl); ++ struct bch_fs *c = ra->c; ++ struct btree *b = ra->b; ++ struct printbuf buf = PRINTBUF; ++ bool dump_bset_maps = false; ++ bool have_retry = false; ++ int ret = 0, best = -1, write = READ; ++ 
unsigned i, written = 0, written2 = 0; ++ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ++ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; ++ ++ for (i = 0; i < ra->nr; i++) { ++ struct btree_node *bn = ra->buf[i]; ++ ++ if (ra->err[i]) ++ continue; ++ ++ if (le64_to_cpu(bn->magic) != bset_magic(c) || ++ (seq && seq != bn->keys.seq)) ++ continue; ++ ++ if (best < 0) { ++ best = i; ++ written = btree_node_sectors_written(c, bn); ++ continue; ++ } ++ ++ written2 = btree_node_sectors_written(c, ra->buf[i]); ++ if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "btree node sectors written mismatch: %u != %u", ++ written, written2) || ++ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), ++ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "found bset signature after last bset") || ++ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), ++ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "btree node replicas content mismatch")) ++ dump_bset_maps = true; ++ ++ if (written2 > written) { ++ written = written2; ++ best = i; ++ } ++ } ++fsck_err: ++ if (dump_bset_maps) { ++ for (i = 0; i < ra->nr; i++) { ++ struct btree_node *bn = ra->buf[i]; ++ struct btree_node_entry *bne = NULL; ++ unsigned offset = 0, sectors; ++ bool gap = false; ++ ++ if (ra->err[i]) ++ continue; ++ ++ printbuf_reset(&buf); ++ ++ while (offset < btree_sectors(c)) { ++ if (!offset) { ++ sectors = vstruct_sectors(bn, c->block_bits); ++ } else { ++ bne = ra->buf[i] + (offset << 9); ++ if (bne->keys.seq != bn->keys.seq) ++ break; ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ prt_printf(&buf, " %u-%u", offset, offset + sectors); ++ if (bne && bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), false)) ++ prt_printf(&buf, "*"); ++ offset += sectors; ++ } ++ ++ while (offset < btree_sectors(c)) { ++ bne = ra->buf[i] + (offset << 9); ++ if (bne->keys.seq == bn->keys.seq) { ++ if (!gap) ++ prt_printf(&buf, " GAP"); ++ gap = true; ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ prt_printf(&buf, " %u-%u", offset, offset + sectors); ++ if (bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), false)) ++ prt_printf(&buf, "*"); ++ } ++ offset++; ++ } ++ ++ bch_err(c, "replica %u:%s", i, buf.buf); ++ } ++ } ++ ++ if (best >= 0) { ++ memcpy(b->data, ra->buf[best], btree_bytes(c)); ++ ret = bch2_btree_node_read_done(c, NULL, b, false); ++ } else { ++ ret = -1; ++ } ++ ++ if (ret) ++ set_btree_node_read_error(b); ++ ++ for (i = 0; i < ra->nr; i++) { ++ mempool_free(ra->buf[i], &c->btree_bounce_pool); ++ bio_put(ra->bio[i]); ++ } ++ ++ closure_debug_destroy(&ra->cl); ++ kfree(ra); ++ printbuf_exit(&buf); ++ ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_all_replicas_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ struct btree_node_read_all *ra = rb->ra; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ ra->err[rb->idx] = bio->bi_status; ++ closure_put(&ra->cl); ++} ++ ++/* ++ * XXX This allocates multiple times from the same mempools, and can deadlock ++ * under sufficient memory pressure (but is only a debug path) ++ */ ++static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) ++{ ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded pick; ++ struct btree_node_read_all *ra; ++ unsigned i; ++ ++ ra = kzalloc(sizeof(*ra), GFP_NOFS); ++ if (!ra) ++ return -ENOMEM; ++ ++ closure_init(&ra->cl, NULL); ++ ra->c = c; ++ ra->b = b; ++ ra->nr = bch2_bkey_nr_ptrs(k); ++ ++ for (i = 0; i < ra->nr; i++) { ++ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ++ ra->bio[i] = bio_alloc_bioset(NULL, ++ buf_pages(ra->buf[i], btree_bytes(c)), ++ REQ_OP_READ|REQ_SYNC|REQ_META, ++ GFP_NOFS, ++ &c->btree_bio); ++ } ++ ++ i = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ struct btree_read_bio *rb = ++ container_of(ra->bio[i], struct btree_read_bio, bio); ++ rb->c = c; ++ rb->b = b; ++ rb->ra = ra; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->idx = i; ++ rb->pick = pick; ++ rb->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rb->bio.bi_end_io = btree_node_read_all_replicas_endio; ++ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(&rb->bio)); ++ bio_set_dev(&rb->bio, ca->disk_sb.bdev); ++ ++ closure_get(&ra->cl); ++ submit_bio(&rb->bio); ++ } else { ++ ra->err[i] = BLK_STS_REMOVED; ++ } ++ ++ i++; ++ } ++ ++ if (sync) { ++ closure_sync(&ra->cl); ++ btree_node_read_all_replicas_done(&ra->cl); ++ } else { ++ continue_at(&ra->cl, btree_node_read_all_replicas_done, ++ c->io_complete_wq); ++ } ++ ++ return 0; ++} ++ ++void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ++ bool sync) ++{ ++ struct extent_ptr_decoded pick; ++ struct btree_read_bio *rb; ++ struct bch_dev *ca; ++ struct bio *bio; ++ int ret; ++ ++ trace_btree_read(c, b); ++ ++ if (bch2_verify_all_btree_replicas && ++ !btree_node_read_all_replicas(c, b, sync)) ++ return; ++ ++ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick); ++ ++ if (ret <= 0) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "btree node read error: no device to read from\n at "); ++ btree_pos_to_text(&buf, c, b); ++ bch_err(c, "%s", buf.buf); ++ ++ if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) ++ bch2_fatal_error(c); ++ ++ set_btree_node_read_error(b); ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++ printbuf_exit(&buf); ++ return; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ bio = bio_alloc_bioset(NULL, ++ buf_pages(b->data, btree_bytes(c)), ++ REQ_OP_READ|REQ_SYNC|REQ_META, ++ GFP_NOIO, ++ &c->btree_bio); ++ rb = container_of(bio, struct btree_read_bio, bio); ++ rb->c = c; ++ rb->b = b; ++ rb->ra = NULL; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->pick = pick; ++ INIT_WORK(&rb->work, btree_node_read_work); ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bio->bi_end_io = btree_node_read_endio; ++ bch2_bio_map(bio, b->data, btree_bytes(c)); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(bio)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ ++ if (sync) { ++ submit_bio_wait(bio); ++ ++ btree_node_read_work(&rb->work); ++ } else { ++ submit_bio(bio); ++ } ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ ++ if (sync) ++ btree_node_read_work(&rb->work); ++ else ++ queue_work(c->io_complete_wq, &rb->work); ++ } ++} ++ ++int bch2_btree_root_read(struct bch_fs *c, enum 
btree_id id, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c, level != 0); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ BUG_ON(IS_ERR(b)); ++ ++ bkey_copy(&b->key, k); ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); ++ ++ set_btree_node_read_in_flight(b); ++ ++ bch2_btree_node_read(c, b, true); ++ ++ if (btree_node_read_error(b)) { ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_btree_set_root_for_read(c, b); ++err: ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ return ret; ++} ++ ++void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, ++ struct btree_write *w) ++{ ++ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); ++ ++ do { ++ old = new = v; ++ if (!(old & 1)) ++ break; ++ ++ new &= ~1UL; ++ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); ++ ++ if (old & 1) ++ closure_put(&((struct btree_update *) new)->cl); ++ ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++} ++ ++static void __btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_write *w = btree_prev_write(b); ++ unsigned long old, new, v; ++ ++ bch2_btree_complete_write(c, b, w); ++ ++ v = READ_ONCE(b->flags); ++ do { ++ old = new = v; ++ ++ if ((old & (1U << BTREE_NODE_dirty)) && ++ (old & (1U << BTREE_NODE_need_write)) && ++ !(old & (1U << BTREE_NODE_never_write)) && ++ !(old & (1U << BTREE_NODE_write_blocked)) && ++ !(old & (1U << BTREE_NODE_will_make_reachable))) { ++ new &= ~(1U << BTREE_NODE_dirty); ++ new &= ~(1U << BTREE_NODE_need_write); ++ new |= (1U << BTREE_NODE_write_in_flight); ++ new |= (1U << BTREE_NODE_write_in_flight_inner); ++ new |= (1U << BTREE_NODE_just_written); ++ new ^= (1U << BTREE_NODE_write_idx); ++ } else { ++ new &= ~(1U << BTREE_NODE_write_in_flight); ++ new &= ~(1U << BTREE_NODE_write_in_flight_inner); ++ } ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ if (new & (1U << BTREE_NODE_write_in_flight)) ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ else ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ __btree_node_write_done(c, b); ++ six_unlock_read(&b->c.lock); ++} ++ ++static void btree_node_write_work(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; ++ struct btree *b = wbio->wbio.bio.bi_private; ++ struct bch_extent_ptr *ptr; ++ int ret; ++ ++ btree_bounce_free(c, ++ wbio->data_bytes, ++ wbio->wbio.used_mempool, ++ wbio->data); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, ++ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) ++ goto err; ++ ++ if (wbio->wbio.first_btree_write) { ++ if (wbio->wbio.failed.nr) { ++ ++ } ++ } else { ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, ++ !wbio->wbio.failed.nr)); ++ if (ret) ++ goto err; ++ } ++out: ++ bio_put(&wbio->wbio.bio); ++ 
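/*
 * Editor's sketch, not part of the patch: __btree_node_write_done() above and
 * __bch2_btree_node_write() further down drive a node's write state machine
 * with lockless read-modify-write loops on b->flags.  A minimal userspace
 * analogue of "claiming" a write, using C11 atomics in place of the kernel's
 * cmpxchg(); the flag names are made up for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NODE_DIRTY		(1u << 0)
#define NODE_WRITE_IN_FLIGHT	(1u << 1)

static bool claim_write(_Atomic unsigned *flags)
{
	unsigned old = atomic_load(flags);
	unsigned new;

	do {
		if (!(old & NODE_DIRTY) || (old & NODE_WRITE_IN_FLIGHT))
			return false;	/* clean, or a write already claimed it */
		new = (old & ~NODE_DIRTY) | NODE_WRITE_IN_FLIGHT;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return true;	/* we own this write; others see the flags flip atomically */
}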
btree_node_write_done(c, b); ++ return; ++err: ++ set_btree_node_noevict(b); ++ bch2_fs_fatal_error(c, "fatal error writing btree node"); ++ goto out; ++} ++ ++static void btree_node_write_endio(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_write_bio *orig = parent ?: wbio; ++ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); ++ struct bch_fs *c = wbio->c; ++ struct btree *b = wbio->bio.bi_private; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ unsigned long flags; ++ ++ if (wbio->have_ioref) ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("btree")) { ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bch2_dev_list_add_dev(&orig->failed, wbio->dev); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ } ++ ++ if (wbio->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ ++ if (parent) { ++ bio_put(bio); ++ bio_endio(&parent->bio); ++ return; ++ } ++ ++ clear_btree_node_write_in_flight_inner(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(c->btree_io_complete_wq, &wb->work); ++} ++ ++static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors) ++{ ++ unsigned whiteout_u64s = 0; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), ++ BKEY_TYPE_btree, WRITE, &buf); ++ ++ if (ret) ++ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); ++ printbuf_exit(&buf); ++ if (ret) ++ return ret; ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: ++ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); ++ if (ret) { ++ bch2_inconsistent_error(c); ++ dump_stack(); ++ } ++ ++ return ret; ++} ++ ++static void btree_write_submit(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); ++ struct bch_extent_ptr *ptr; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ ++ bkey_copy(&tmp.k, &wbio->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) ++ ptr->offset += wbio->sector_offset; ++ ++ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); ++} ++ ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) ++{ ++ struct btree_write_bio *wbio; ++ struct bset_tree *t; ++ struct bset *i; ++ struct btree_node *bn = NULL; ++ struct btree_node_entry *bne = NULL; ++ struct sort_iter sort_iter; ++ struct nonce nonce; ++ unsigned bytes_to_write, sectors_to_write, bytes, u64s; ++ u64 seq = 0; ++ bool used_mempool; ++ unsigned long old, new; ++ bool validate_before_checksum = false; ++ void *data; ++ int ret; ++ ++ if (flags & BTREE_WRITE_ALREADY_STARTED) ++ goto do_write; ++ ++ /* ++ * We may only have a read lock on the btree node - the dirty bit is our ++ * "lock" against racing with other threads that may be trying to start ++ * a write, we do a write iff we clear the dirty bit. 
Since setting the ++ * dirty bit requires a write lock, we can't race with other threads ++ * redirtying it: ++ */ ++ do { ++ old = new = READ_ONCE(b->flags); ++ ++ if (!(old & (1 << BTREE_NODE_dirty))) ++ return; ++ ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED) && ++ !(old & (1 << BTREE_NODE_need_write))) ++ return; ++ ++ if (old & ++ ((1 << BTREE_NODE_never_write)| ++ (1 << BTREE_NODE_write_blocked))) ++ return; ++ ++ if (b->written && ++ (old & (1 << BTREE_NODE_will_make_reachable))) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) ++ return; ++ ++ new &= ~(1 << BTREE_NODE_dirty); ++ new &= ~(1 << BTREE_NODE_need_write); ++ new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_write_in_flight_inner); ++ new |= (1 << BTREE_NODE_just_written); ++ new ^= (1 << BTREE_NODE_write_idx); ++ } while (cmpxchg_acquire(&b->flags, old, new) != old); ++ ++ if (new & (1U << BTREE_NODE_need_write)) ++ return; ++do_write: ++ atomic_dec(&c->btree_cache.dirty); ++ ++ BUG_ON(btree_node_fake(b)); ++ BUG_ON((b->will_make_reachable != 0) != !b->written); ++ ++ BUG_ON(b->written >= btree_sectors(c)); ++ BUG_ON(b->written & (block_sectors(c) - 1)); ++ BUG_ON(bset_written(b, btree_bset_last(b))); ++ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); ++ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ bytes = !b->written ++ ? sizeof(struct btree_node) ++ : sizeof(struct btree_node_entry); ++ ++ bytes += b->whiteout_u64s * sizeof(u64); ++ ++ for_each_bset(b, t) { ++ i = bset(b, t); ++ ++ if (bset_written(b, i)) ++ continue; ++ ++ bytes += le16_to_cpu(i->u64s) * sizeof(u64); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ seq = max(seq, le64_to_cpu(i->journal_seq)); ++ } ++ ++ BUG_ON(b->written && !seq); ++ ++ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ ++ bytes += 8; ++ ++ /* buffer must be a multiple of the block size */ ++ bytes = round_up(bytes, block_bytes(c)); ++ ++ data = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ if (!b->written) { ++ bn = data; ++ *bn = *b->data; ++ i = &bn->keys; ++ } else { ++ bne = data; ++ bne->keys = b->data->keys; ++ i = &bne->keys; ++ } ++ ++ i->journal_seq = cpu_to_le64(seq); ++ i->u64s = 0; ++ ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); ++ ++ b->whiteout_u64s = 0; ++ ++ u64s = bch2_sort_keys(i->start, &sort_iter, false); ++ le16_add_cpu(&i->u64s, u64s); ++ ++ set_needs_whiteout(i, false); ++ ++ /* do we have data to write? */ ++ if (b->written && !i->u64s) ++ goto nowrite; ++ ++ bytes_to_write = vstruct_end(i) - data; ++ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; ++ ++ memset(data + bytes_to_write, 0, ++ (sectors_to_write << 9) - bytes_to_write); ++ ++ BUG_ON(b->written + sectors_to_write > btree_sectors(c)); ++ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); ++ BUG_ON(i->seq != b->data->keys.seq); ++ ++ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ++ ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) ++ : cpu_to_le16(c->sb.version); ++ SET_BSET_OFFSET(i, b->written); ++ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) ++ validate_before_checksum = true; ++ ++ /* validate_bset will be modifying: */ ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) ++ validate_before_checksum = true; ++ ++ /* if we're going to be encrypting, check metadata validity first: */ ++ if (validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error encrypting btree node: %i\n", ret)) ++ goto err; ++ ++ nonce = btree_nonce(i, b->written << 9); ++ ++ if (bn) ++ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); ++ else ++ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ /* if we're not encrypting, check metadata after checksumming: */ ++ if (!validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ /* ++ * We handle btree write errors by immediately halting the journal - ++ * after we've done that, we can't issue any subsequent btree writes ++ * because they might have pointers to new nodes that failed to write. ++ * ++ * Furthermore, there's no point in doing any more btree writes because ++ * with the journal stopped, we're never going to update the journal to ++ * reflect that those writes were done and the data flushed from the ++ * journal: ++ * ++ * Also on journal error, the pending write may have updates that were ++ * never journalled (interior nodes, see btree_update_nodes_written()) - ++ * it's critical that we don't do the write in that case otherwise we ++ * will have updates visible that weren't in the journal: ++ * ++ * Make sure to update b->written so bch2_btree_init_next() doesn't ++ * break: ++ */ ++ if (bch2_journal_error(&c->journal) || ++ c->opts.nochanges) ++ goto err; ++ ++ trace_btree_write(b, bytes_to_write, sectors_to_write); ++ ++ wbio = container_of(bio_alloc_bioset(NULL, ++ buf_pages(data, sectors_to_write << 9), ++ REQ_OP_WRITE|REQ_META, ++ GFP_NOIO, ++ &c->btree_bio), ++ struct btree_write_bio, wbio.bio); ++ wbio_init(&wbio->wbio.bio); ++ wbio->data = data; ++ wbio->data_bytes = bytes; ++ wbio->sector_offset = b->written; ++ wbio->wbio.c = c; ++ wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.first_btree_write = !b->written; ++ wbio->wbio.bio.bi_end_io = btree_node_write_endio; ++ wbio->wbio.bio.bi_private = b; ++ ++ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); ++ ++ bkey_copy(&wbio->key, &b->key); ++ ++ b->written += sectors_to_write; ++ ++ if (wbio->wbio.first_btree_write && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(b->written); ++ ++ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = ++ cpu_to_le16(b->written); ++ ++ atomic64_inc(&c->btree_writes_nr); ++ atomic64_add(sectors_to_write, &c->btree_writes_sectors); ++ ++ INIT_WORK(&wbio->work, btree_write_submit); ++ queue_work(c->io_complete_wq, &wbio->work); ++ return; ++err: ++ set_btree_node_noevict(b); ++ if (!b->written && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(sectors_to_write); ++ b->written += sectors_to_write; ++nowrite: ++ btree_bounce_free(c, bytes, used_mempool, data); ++ 
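/*
 * Editor's sketch, not part of the patch: __bch2_btree_node_write() above pads
 * the data it is about to write out to a whole number of blocks (zeroing the
 * tail) and converts that size to 512-byte sectors.  A standalone restatement
 * of that arithmetic with example numbers; the 4096-byte block size is only an
 * assumption for the example.
 */
#include <stdio.h>

static unsigned round_up_to(unsigned v, unsigned to)
{
	return (v + to - 1) / to * to;		/* 'to' must be non-zero */
}

int main(void)
{
	unsigned block_bytes	  = 4096;	/* assumed block size */
	unsigned bytes_to_write	  = 5000;	/* i.e. vstruct_end(i) - data */
	unsigned padded		  = round_up_to(bytes_to_write, block_bytes);
	unsigned sectors_to_write = padded >> 9;

	/* 5000 bytes -> 8192 bytes padded -> 16 sectors, 3192 tail bytes zeroed */
	printf("zero %u tail bytes, write %u sectors\n",
	       padded - bytes_to_write, sectors_to_write);
	return 0;
}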
__btree_node_write_done(c, b); ++} ++ ++/* ++ * Work that must be done with write lock held: ++ */ ++bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) ++{ ++ bool invalidated_iter = false; ++ struct btree_node_entry *bne; ++ struct bset_tree *t; ++ ++ if (!btree_node_just_written(b)) ++ return false; ++ ++ BUG_ON(b->whiteout_u64s); ++ ++ clear_btree_node_just_written(b); ++ ++ /* ++ * Note: immediately after write, bset_written() doesn't work - the ++ * amount of data we had to write after compaction might have been ++ * smaller than the offset of the last bset. ++ * ++ * However, we know that all bsets have been written here, as long as ++ * we're still holding the write lock: ++ */ ++ ++ /* ++ * XXX: decide if we really want to unconditionally sort down to a ++ * single bset: ++ */ ++ if (b->nsets > 1) { ++ btree_node_sort(c, b, 0, b->nsets, true); ++ invalidated_iter = true; ++ } else { ++ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); ++ } ++ ++ for_each_bset(b, t) ++ set_needs_whiteout(bset(b, t), true); ++ ++ bch2_btree_verify(c, b); ++ ++ /* ++ * If later we don't unconditionally sort down to a single bset, we have ++ * to ensure this is still true: ++ */ ++ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return invalidated_iter; ++} ++ ++/* ++ * Use this one if the node is intent locked: ++ */ ++void bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held, ++ unsigned flags) ++{ ++ if (lock_type_held == SIX_LOCK_intent || ++ (lock_type_held == SIX_LOCK_read && ++ six_lock_tryupgrade(&b->c.lock))) { ++ __bch2_btree_node_write(c, b, flags); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->c.lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ six_unlock_write(&b->c.lock); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ } else { ++ __bch2_btree_node_write(c, b, flags); ++ if (lock_type_held == SIX_LOCK_write && ++ btree_node_just_written(b)) ++ bch2_btree_post_write_cleanup(c, b); ++ } ++} ++ ++static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ bool ret = false; ++restart: ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (test_bit(flag, &b->flags)) { ++ rcu_read_unlock(); ++ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); ++ ret = true; ++ goto restart; ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++bool bch2_btree_flush_all_reads(struct bch_fs *c) ++{ ++ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); ++} ++ ++bool bch2_btree_flush_all_writes(struct bch_fs *c) ++{ ++ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); ++} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +new file mode 100644 +index 000000000000..8af853642123 +--- /dev/null ++++ b/fs/bcachefs/btree_io.h +@@ -0,0 +1,222 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_IO_H ++#define _BCACHEFS_BTREE_IO_H ++ ++#include "bkey_methods.h" ++#include "bset.h" ++#include "btree_locking.h" ++#include "checksum.h" ++#include "extents.h" ++#include "io_types.h" ++ ++struct bch_fs; ++struct btree_write; ++struct btree; ++struct btree_iter; ++struct btree_node_read_all; ++ ++static inline void 
set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) ++{ ++ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_inc(&c->btree_cache.dirty); ++} ++ ++static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) ++{ ++ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_dec(&c->btree_cache.dirty); ++} ++ ++static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) ++ : 0; ++} ++ ++struct btree_read_bio { ++ struct bch_fs *c; ++ struct btree *b; ++ struct btree_node_read_all *ra; ++ u64 start_time; ++ unsigned have_ioref:1; ++ unsigned idx:7; ++ struct extent_ptr_decoded pick; ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++struct btree_write_bio { ++ struct work_struct work; ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ void *data; ++ unsigned data_bytes; ++ unsigned sector_offset; ++ struct bch_write_bio wbio; ++}; ++ ++void bch2_btree_node_io_unlock(struct btree *); ++void bch2_btree_node_io_lock(struct btree *); ++void __bch2_btree_node_wait_on_read(struct btree *); ++void __bch2_btree_node_wait_on_write(struct btree *); ++void bch2_btree_node_wait_on_read(struct btree *); ++void bch2_btree_node_wait_on_write(struct btree *); ++ ++enum compact_mode { ++ COMPACT_LAZY, ++ COMPACT_ALL, ++}; ++ ++bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, ++ enum compact_mode); ++ ++static inline bool should_compact_bset_lazy(struct btree *b, ++ struct bset_tree *t) ++{ ++ unsigned total_u64s = bset_u64s(t); ++ unsigned dead_u64s = bset_dead_u64s(b, t); ++ ++ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; ++} ++ ++static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (should_compact_bset_lazy(b, t)) ++ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ ++ return false; ++} ++ ++static inline struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ int ret; ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, ++ &bn->flags, bytes); ++ if (ret) ++ return ret; ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); ++ ++void bch2_btree_node_drop_keys_outside_node(struct btree *); ++ ++void bch2_btree_build_aux_trees(struct btree *); ++void bch2_btree_init_next(struct btree_trans *, struct btree *); ++ ++int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, ++ struct btree *, bool); ++void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); ++int bch2_btree_root_read(struct bch_fs *, enum btree_id, ++ const struct bkey_i *, unsigned); ++ ++void bch2_btree_complete_write(struct bch_fs *, struct btree *, ++ struct btree_write *); ++ ++bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); ++ 
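/*
 * Editor's sketch, not part of the patch: should_compact_bset_lazy() above
 * only triggers compaction once a bset has both a non-trivial amount of dead
 * space (more than 64 u64s) and more than a third of its u64s dead.  A
 * self-contained restatement with a few example inputs; the function name is
 * illustrative.
 */
#include <assert.h>
#include <stdbool.h>

static bool worth_compacting(unsigned total_u64s, unsigned dead_u64s)
{
	return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
}

int main(void)
{
	assert(!worth_compacting(1000, 64));	/* too little dead space overall */
	assert(!worth_compacting(1000, 300));	/* dead space, but under a third */
	assert( worth_compacting(1000, 400));	/* over a third of the bset is dead */
	return 0;
}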
++#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) ++#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); ++void bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type, unsigned); ++ ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_held) ++{ ++ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); ++} ++ ++bool bch2_btree_flush_all_reads(struct bch_fs *); ++bool bch2_btree_flush_all_writes(struct bch_fs *); ++ ++static inline void compat_bformat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_inodes) { ++ swap(f->bits_per_field[BKEY_FIELD_INODE], ++ f->bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(f->field_offset[BKEY_FIELD_INODE], ++ f->field_offset[BKEY_FIELD_OFFSET]); ++ } ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ (level || btree_type_has_snapshots(btree_id))) { ++ u64 max_packed = ++ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); ++ ++ f->field_offset[BKEY_FIELD_SNAPSHOT] = write ++ ? 0 ++ : U32_MAX - max_packed; ++ } ++} ++ ++static inline void compat_bpos(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bpos *p) ++{ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bpos_swab(p); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_inodes) ++ swap(p->inode, p->offset); ++} ++ ++static inline void compat_btree_node(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct btree_node *bn) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bpos_cmp(bn->min_key, POS_MIN) && ++ write) ++ bn->min_key = bpos_nosnap_predecessor(bn->min_key); ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ write) ++ bn->max_key.snapshot = 0; ++ ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ !write) ++ bn->max_key.snapshot = U32_MAX; ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bpos_cmp(bn->min_key, POS_MIN) && ++ !write) ++ bn->min_key = bpos_nosnap_successor(bn->min_key); ++} ++ ++#endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +new file mode 100644 +index 000000000000..a90a45939aa3 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.c +@@ -0,0 +1,3515 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_buf.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "subvolume.h" ++ ++#include ++#include ++#include ++ ++static void btree_trans_verify_sorted(struct btree_trans *); ++inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ ++static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); ++static inline void btree_path_list_add(struct 
btree_trans *, struct btree_path *, ++ struct btree_path *); ++ ++static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return iter->ip_allocated; ++#else ++ return 0; ++#endif ++} ++ ++static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); ++ ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans); ++ } else { ++ return 0; ++ } ++} ++ ++static inline int __btree_path_cmp(const struct btree_path *l, ++ enum btree_id r_btree_id, ++ bool r_cached, ++ struct bpos r_pos, ++ unsigned r_level) ++{ ++ /* ++ * Must match lock ordering as defined by __bch2_btree_node_lock: ++ */ ++ return cmp_int(l->btree_id, r_btree_id) ?: ++ cmp_int((int) l->cached, (int) r_cached) ?: ++ bpos_cmp(l->pos, r_pos) ?: ++ -cmp_int(l->level, r_level); ++} ++ ++static inline int btree_path_cmp(const struct btree_path *l, ++ const struct btree_path *r) ++{ ++ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); ++} ++ ++static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) ++{ ++ /* Are we iterating over keys in all snapshots? */ ++ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { ++ p = bpos_successor(p); ++ } else { ++ p = bpos_nosnap_successor(p); ++ p.snapshot = iter->snapshot; ++ } ++ ++ return p; ++} ++ ++static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) ++{ ++ /* Are we iterating over keys in all snapshots? */ ++ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { ++ p = bpos_predecessor(p); ++ } else { ++ p = bpos_nosnap_predecessor(p); ++ p.snapshot = iter->snapshot; ++ } ++ ++ return p; ++} ++ ++static inline bool is_btree_node(struct btree_path *path, unsigned l) ++{ ++ return l < BTREE_MAX_DEPTH && ++ (unsigned long) path->l[l].b >= 128; ++} ++ ++static inline struct bpos btree_iter_search_key(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ pos = bkey_successor(iter, pos); ++ return pos; ++} ++ ++static inline bool btree_path_pos_before_node(struct btree_path *path, ++ struct btree *b) ++{ ++ return bpos_cmp(path->pos, b->data->min_key) < 0; ++} ++ ++static inline bool btree_path_pos_after_node(struct btree_path *path, ++ struct btree *b) ++{ ++ return bpos_cmp(b->key.k.p, path->pos) < 0; ++} ++ ++static inline bool btree_path_pos_in_node(struct btree_path *path, ++ struct btree *b) ++{ ++ return path->btree_id == b->c.btree_id && ++ !btree_path_pos_before_node(path, b) && ++ !btree_path_pos_after_node(path, b); ++} ++ ++/* Btree node locking: */ ++ ++void bch2_btree_node_unlock_write(struct btree_trans *trans, ++ struct btree_path *path, struct btree *b) ++{ ++ bch2_btree_node_unlock_write_inlined(trans, path, b); ++} ++ ++void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_path *linked; ++ unsigned readers = 0; ++ ++ trans_for_each_path(trans, linked) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) ++ readers++; ++ ++ /* ++ * Must drop our read locks before calling six_lock_write() - ++ * six_unlock() won't do wakeups until the reader count ++ * goes to 0, and it's safe because we have the node intent ++ * locked: ++ */ ++ if (!b->c.lock.readers) ++ 
atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_sub(*b->c.lock.readers, readers); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ if (!b->c.lock.readers) ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_add(*b->c.lock.readers, readers); ++} ++ ++bool __bch2_btree_node_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ struct btree *b = btree_path_node(path, level); ++ int want = __btree_lock_want(path, level); ++ ++ if (!is_btree_node(path, level)) ++ goto fail; ++ ++ if (race_fault()) ++ goto fail; ++ ++ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(path, b, level) && ++ btree_node_lock_increment(trans, b, level, want))) { ++ mark_btree_node_locked(trans, path, level, want); ++ return true; ++ } ++fail: ++ if (b != BTREE_ITER_NO_NODE_CACHED && ++ b != BTREE_ITER_NO_NODE_INIT) ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? b->c.lock.state.seq : 0); ++ return false; ++} ++ ++bool bch2_btree_node_upgrade(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ struct btree *b = path->l[level].b; ++ ++ if (!is_btree_node(path, level)) ++ return false; ++ ++ switch (btree_lock_want(path, level)) { ++ case BTREE_NODE_UNLOCKED: ++ BUG_ON(btree_node_locked(path, level)); ++ return true; ++ case BTREE_NODE_READ_LOCKED: ++ BUG_ON(btree_node_intent_locked(path, level)); ++ return bch2_btree_node_relock(trans, path, level); ++ case BTREE_NODE_INTENT_LOCKED: ++ break; ++ } ++ ++ if (btree_node_intent_locked(path, level)) ++ return true; ++ ++ if (race_fault()) ++ return false; ++ ++ if (btree_node_locked(path, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) ++ goto success; ++ ++ if (btree_node_lock_seq_matches(path, b, level) && ++ btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_unlock(trans, path, level); ++ goto success; ++ } ++ ++ return false; ++success: ++ mark_btree_node_intent_locked(trans, path, level); ++ return true; ++} ++ ++static inline bool btree_path_get_locks(struct btree_trans *trans, ++ struct btree_path *path, ++ bool upgrade) ++{ ++ unsigned l = path->level; ++ int fail_idx = -1; ++ ++ do { ++ if (!btree_path_node(path, l)) ++ break; ++ ++ if (!(upgrade ++ ? bch2_btree_node_upgrade(trans, path, l) ++ : bch2_btree_node_relock(trans, path, l))) ++ fail_idx = l; ++ ++ l++; ++ } while (l < path->locks_want); ++ ++ /* ++ * When we fail to get a lock, we have to ensure that any child nodes ++ * can't be relocked so bch2_btree_path_traverse has to walk back up to ++ * the node that we failed to relock: ++ */ ++ if (fail_idx >= 0) { ++ __bch2_btree_path_unlock(trans, path); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ ++ do { ++ path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } while (fail_idx >= 0); ++ } ++ ++ if (path->uptodate == BTREE_ITER_NEED_RELOCK) ++ path->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_trans_verify_locks(trans); ++ ++ return path->uptodate < BTREE_ITER_NEED_RELOCK; ++} ++ ++static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, ++ bool cached) ++{ ++ return !cached ++ ? 
container_of(_b, struct btree, c)->key.k.p ++ : container_of(_b, struct bkey_cached, c)->key.pos; ++} ++ ++/* Slowpath: */ ++int __bch2_btree_node_lock(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ struct btree_path *linked; ++ unsigned reason; ++ ++ /* Check if it's safe to block: */ ++ trans_for_each_path(trans, linked) { ++ if (!linked->nodes_locked) ++ continue; ++ ++ /* ++ * Can't block taking an intent lock if we have _any_ nodes read ++ * locked: ++ * ++ * - Our read lock blocks another thread with an intent lock on ++ * the same node from getting a write lock, and thus from ++ * dropping its intent lock ++ * ++ * - And the other thread may have multiple nodes intent locked: ++ * both the node we want to intent lock, and the node we ++ * already have read locked - deadlock: ++ */ ++ if (type == SIX_LOCK_intent && ++ linked->nodes_locked != linked->nodes_intent_locked) { ++ reason = 1; ++ goto deadlock; ++ } ++ ++ if (linked->btree_id != path->btree_id) { ++ if (linked->btree_id < path->btree_id) ++ continue; ++ ++ reason = 3; ++ goto deadlock; ++ } ++ ++ /* ++ * Within the same btree, non-cached paths come before cached ++ * paths: ++ */ ++ if (linked->cached != path->cached) { ++ if (!linked->cached) ++ continue; ++ ++ reason = 4; ++ goto deadlock; ++ } ++ ++ /* ++ * Interior nodes must be locked before their descendants: if ++ * another path has possible descendants locked of the node ++ * we're about to lock, it must have the ancestors locked too: ++ */ ++ if (level > __fls(linked->nodes_locked)) { ++ reason = 5; ++ goto deadlock; ++ } ++ ++ /* Must lock btree nodes in key order: */ ++ if (btree_node_locked(linked, level) && ++ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ linked->cached)) <= 0) { ++ reason = 7; ++ goto deadlock; ++ } ++ } ++ ++ return btree_node_lock_type(trans, path, b, pos, level, ++ type, should_sleep_fn, p); ++deadlock: ++ trace_trans_restart_would_deadlock(trans->fn, ip, ++ trans->in_traverse_all, reason, ++ linked->btree_id, ++ linked->cached, ++ &linked->pos, ++ path->btree_id, ++ path->cached, ++ &pos); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); ++} ++ ++/* Btree iterator locking: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_path_verify_locks(struct btree_path *path) ++{ ++ unsigned l; ++ ++ if (!path->nodes_locked) { ++ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && ++ btree_path_node(path, path->level)); ++ return; ++ } ++ ++ for (l = 0; btree_path_node(path, l); l++) ++ BUG_ON(btree_lock_want(path, l) != ++ btree_node_locked_type(path, l)); ++} ++ ++void bch2_trans_verify_locks(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_verify_locks(path); ++} ++#else ++static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} ++#endif ++ ++/* Btree path locking: */ ++ ++/* ++ * Only for btree_cache.c - only relocks intent locks ++ */ ++int bch2_btree_path_relock_intent(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ unsigned l; ++ ++ for (l = path->level; ++ l < path->locks_want && btree_path_node(path, l); ++ l++) { ++ if (!bch2_btree_node_relock(trans, path, l)) { ++ __bch2_btree_path_unlock(trans, path); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_path_intent(trans->fn, 
_RET_IP_, ++ path->btree_id, &path->pos); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); ++ } ++ } ++ ++ return 0; ++} ++ ++__flatten ++static int bch2_btree_path_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned long trace_ip) ++{ ++ bool ret = btree_path_get_locks(trans, path, false); ++ ++ if (!ret) { ++ trace_trans_restart_relock_path(trans->fn, trace_ip, ++ path->btree_id, &path->pos); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); ++ } ++ ++ return 0; ++} ++ ++bool __bch2_btree_path_upgrade(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_locks_want) ++{ ++ struct btree_path *linked; ++ ++ EBUG_ON(path->locks_want >= new_locks_want); ++ ++ path->locks_want = new_locks_want; ++ ++ if (btree_path_get_locks(trans, path, true)) ++ return true; ++ ++ /* ++ * XXX: this is ugly - we'd prefer to not be mucking with other ++ * iterators in the btree_trans here. ++ * ++ * On failure to upgrade the iterator, setting iter->locks_want and ++ * calling get_locks() is sufficient to make bch2_btree_path_traverse() ++ * get the locks we want on transaction restart. ++ * ++ * But if this iterator was a clone, on transaction restart what we did ++ * to this iterator isn't going to be preserved. ++ * ++ * Possibly we could add an iterator field for the parent iterator when ++ * an iterator is a copy - for now, we'll just upgrade any other ++ * iterators with the same btree id. ++ * ++ * The code below used to be needed to ensure ancestor nodes get locked ++ * before interior nodes - now that's handled by ++ * bch2_btree_path_traverse_all(). ++ */ ++ if (!path->cached && !trans->in_traverse_all) ++ trans_for_each_path(trans, linked) ++ if (linked != path && ++ linked->cached == path->cached && ++ linked->btree_id == path->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_path_get_locks(trans, linked, true); ++ } ++ ++ return false; ++} ++ ++void __bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_locks_want) ++{ ++ unsigned l; ++ ++ EBUG_ON(path->locks_want < new_locks_want); ++ ++ path->locks_want = new_locks_want; ++ ++ while (path->nodes_locked && ++ (l = __fls(path->nodes_locked)) >= path->locks_want) { ++ if (l > path->level) { ++ btree_node_unlock(trans, path, l); ++ } else { ++ if (btree_node_intent_locked(path, l)) { ++ six_lock_downgrade(&path->l[l].b->c.lock); ++ path->nodes_intent_locked ^= 1 << l; ++ } ++ break; ++ } ++ } ++ ++ bch2_btree_path_verify_locks(path); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_downgrade(trans, path); ++} ++ ++/* Btree transaction locking: */ ++ ++int bch2_trans_relock(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ if (unlikely(trans->restarted)) ++ return -BCH_ERR_transaction_restart_relock; ++ ++ trans_for_each_path(trans, path) ++ if (path->should_be_locked && ++ bch2_btree_path_relock(trans, path, _RET_IP_)) { ++ trace_trans_restart_relock(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); ++ BUG_ON(!trans->restarted); ++ return -BCH_ERR_transaction_restart_relock; ++ } ++ return 0; ++} ++ ++void bch2_trans_unlock(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ __bch2_btree_path_unlock(trans, path); ++ ++ /* ++ * bch2_gc_btree_init_recurse() doesn't use btree iterators for 
walking ++ * btree nodes, it implements its own walking: ++ */ ++ BUG_ON(!trans->is_initial_gc && ++ lock_class_is_held(&bch2_btree_node_lock_key)); ++} ++ ++/* Btree iterator: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_path_verify_cached(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bkey_cached *ck; ++ bool locked = btree_node_locked(path, 0); ++ ++ if (!bch2_btree_node_relock(trans, path, 0)) ++ return; ++ ++ ck = (void *) path->l[0].b; ++ BUG_ON(ck->key.btree_id != path->btree_id || ++ bkey_cmp(ck->key.pos, path->pos)); ++ ++ if (!locked) ++ btree_node_unlock(trans, path, 0); ++} ++ ++static void bch2_btree_path_verify_level(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ struct btree_path_level *l; ++ struct btree_node_iter tmp; ++ bool locked; ++ struct bkey_packed *p, *k; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ const char *msg; ++ ++ if (!bch2_debug_check_iterators) ++ return; ++ ++ l = &path->l[level]; ++ tmp = l->iter; ++ locked = btree_node_locked(path, level); ++ ++ if (path->cached) { ++ if (!level) ++ bch2_btree_path_verify_cached(trans, path); ++ return; ++ } ++ ++ if (!btree_path_node(path, level)) ++ return; ++ ++ if (!bch2_btree_node_relock(trans, path, level)) ++ return; ++ ++ BUG_ON(!btree_path_pos_in_node(path, l->b)); ++ ++ bch2_btree_node_iter_verify(&l->iter, l->b); ++ ++ /* ++ * For interior nodes, the iterator will have skipped past deleted keys: ++ */ ++ p = level ++ ? bch2_btree_node_iter_prev(&tmp, l->b) ++ : bch2_btree_node_iter_prev_all(&tmp, l->b); ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { ++ msg = "before"; ++ goto err; ++ } ++ ++ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { ++ msg = "after"; ++ goto err; ++ } ++ ++ if (!locked) ++ btree_node_unlock(trans, path, level); ++ return; ++err: ++ bch2_bpos_to_text(&buf1, path->pos); ++ ++ if (p) { ++ struct bkey uk = bkey_unpack_key(l->b, p); ++ bch2_bkey_to_text(&buf2, &uk); ++ } else { ++ prt_printf(&buf2, "(none)"); ++ } ++ ++ if (k) { ++ struct bkey uk = bkey_unpack_key(l->b, k); ++ bch2_bkey_to_text(&buf3, &uk); ++ } else { ++ prt_printf(&buf3, "(none)"); ++ } ++ ++ panic("path should be %s key at level %u:\n" ++ "path pos %s\n" ++ "prev key %s\n" ++ "cur key %s\n", ++ msg, level, buf1.buf, buf2.buf, buf3.buf); ++} ++ ++static void bch2_btree_path_verify(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i; ++ ++ EBUG_ON(path->btree_id >= BTREE_ID_NR); ++ ++ for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { ++ if (!path->l[i].b) { ++ BUG_ON(!path->cached && ++ c->btree_roots[path->btree_id].b->c.level > i); ++ break; ++ } ++ ++ bch2_btree_path_verify_level(trans, path, i); ++ } ++ ++ bch2_btree_path_verify_locks(path); ++} ++ ++void bch2_trans_verify_paths(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_verify(trans, path); ++} ++ ++static void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ ++ BUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); ++ ++ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); ++ ++ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ !btree_type_has_snapshots(iter->btree_id)); ++ ++ if (iter->update_path) ++ bch2_btree_path_verify(trans, iter->update_path); ++ bch2_btree_path_verify(trans, iter->path); ++} ++ ++static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) ++{ ++ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !iter->pos.snapshot); ++ ++ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ iter->pos.snapshot != iter->snapshot); ++ ++ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0); ++} ++ ++static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter copy; ++ struct bkey_s_c prev; ++ int ret = 0; ++ ++ if (!bch2_debug_check_iterators) ++ return 0; ++ ++ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) ++ return 0; ++ ++ if (bkey_err(k) || !k.k) ++ return 0; ++ ++ BUG_ON(!bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)); ++ ++ bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, ++ BTREE_ITER_NOPRESERVE| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ prev = bch2_btree_iter_prev(©); ++ if (!prev.k) ++ goto out; ++ ++ ret = bkey_err(prev); ++ if (ret) ++ goto out; ++ ++ if (!bkey_cmp(prev.k->p, k.k->p) && ++ bch2_snapshot_is_ancestor(trans->c, iter->snapshot, ++ prev.k->p.snapshot) > 0) { ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ bch2_bkey_to_text(&buf1, k.k); ++ bch2_bkey_to_text(&buf2, prev.k); ++ ++ panic("iter snap %u\n" ++ "k %s\n" ++ "prev %s\n", ++ iter->snapshot, ++ buf1.buf, buf2.buf); ++ } ++out: ++ bch2_trans_iter_exit(trans, ©); ++ return ret; ++} ++ ++void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++ struct bpos pos, bool key_cache) ++{ ++ struct btree_path *path; ++ unsigned idx; ++ struct printbuf buf = PRINTBUF; ++ ++ trans_for_each_path_inorder(trans, path, idx) { ++ int cmp = cmp_int(path->btree_id, id) ?: ++ cmp_int(path->cached, key_cache); ++ ++ if (cmp > 0) ++ break; ++ if (cmp < 0) ++ continue; ++ ++ if (!(path->nodes_locked & 1) || ++ !path->should_be_locked) ++ continue; ++ ++ if (!key_cache) { ++ if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && ++ bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) ++ return; ++ } else { ++ if (!bkey_cmp(pos, path->pos)) ++ return; ++ } ++ } ++ ++ bch2_dump_trans_paths_updates(trans); ++ bch2_bpos_to_text(&buf, pos); ++ ++ panic("not locked: %s %s%s\n", ++ bch2_btree_ids[id], buf.buf, ++ key_cache ? 
" cached" : ""); ++} ++ ++#else ++ ++static inline void bch2_btree_path_verify_level(struct btree_trans *trans, ++ struct btree_path *path, unsigned l) {} ++static inline void bch2_btree_path_verify(struct btree_trans *trans, ++ struct btree_path *path) {} ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} ++static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} ++static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } ++ ++#endif ++ ++/* Btree path: fixups after btree updates */ ++ ++static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) { ++ set->k = __btree_node_key_to_offset(b, k); ++ bch2_btree_node_iter_sort(iter, b); ++ return; ++ } ++ ++ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); ++} ++ ++static void __bch2_btree_path_fix_key_modified(struct btree_path *path, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_path_level *l = &path->l[b->c.level]; ++ ++ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) ++ return; ++ ++ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++} ++ ++void bch2_btree_path_fix_key_modified(struct btree_trans *trans, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path_with_node(trans, b, path) { ++ __bch2_btree_path_fix_key_modified(path, b, where); ++ bch2_btree_path_verify_level(trans, path, b->c.level); ++ } ++} ++ ++static void __bch2_btree_node_iter_fix(struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ const struct bkey_packed *end = btree_bkey_last(b, t); ++ struct btree_node_iter_set *set; ++ unsigned offset = __btree_node_key_to_offset(b, where); ++ int shift = new_u64s - clobber_u64s; ++ unsigned old_end = t->end_offset - shift; ++ unsigned orig_iter_pos = node_iter->data[0].k; ++ bool iter_current_key_modified = ++ orig_iter_pos >= offset && ++ orig_iter_pos <= offset + clobber_u64s; ++ ++ btree_node_iter_for_each(node_iter, set) ++ if (set->end == old_end) ++ goto found; ++ ++ /* didn't find the bset in the iterator - might have to readd it: */ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { ++ bch2_btree_node_iter_push(node_iter, b, where, end); ++ goto fixup_done; ++ } else { ++ /* Iterator is after key that changed */ ++ return; ++ } ++found: ++ set->end = t->end_offset; ++ ++ /* Iterator hasn't gotten to the key that changed yet: */ ++ if (set->k < offset) ++ return; ++ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { ++ set->k = offset; ++ } else if (set->k < offset + clobber_u64s) { ++ set->k = offset + new_u64s; ++ if (set->k == set->end) ++ bch2_btree_node_iter_set_drop(node_iter, set); ++ } else { ++ /* Iterator is after key that changed */ ++ set->k = (int) set->k + shift; ++ return; ++ } ++ ++ bch2_btree_node_iter_sort(node_iter, b); ++fixup_done: ++ if (node_iter->data[0].k != orig_iter_pos) ++ iter_current_key_modified = true; ++ ++ /* ++ * When a new key is added, and the node iterator now points to that ++ * key, the iterator might have skipped past deleted keys that should ++ * come after the key the iterator 
now points to. We have to rewind to ++ * before those deleted keys - otherwise ++ * bch2_btree_node_iter_prev_all() breaks: ++ */ ++ if (!bch2_btree_node_iter_end(node_iter) && ++ iter_current_key_modified && ++ b->c.level) { ++ struct bset_tree *t; ++ struct bkey_packed *k, *k2, *p; ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ for_each_bset(b, t) { ++ bool set_pos = false; ++ ++ if (node_iter->data[0].end == t->end_offset) ++ continue; ++ ++ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); ++ ++ while ((p = bch2_bkey_prev_all(b, t, k2)) && ++ bkey_iter_cmp(b, k, p) < 0) { ++ k2 = p; ++ set_pos = true; ++ } ++ ++ if (set_pos) ++ btree_node_iter_set_set_pos(node_iter, ++ b, t, k2); ++ } ++ } ++} ++ ++void bch2_btree_node_iter_fix(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct btree_path *linked; ++ ++ if (node_iter != &path->l[b->c.level].iter) { ++ __bch2_btree_node_iter_fix(path, b, node_iter, t, ++ where, clobber_u64s, new_u64s); ++ ++ if (bch2_debug_check_iterators) ++ bch2_btree_node_iter_verify(node_iter, b); ++ } ++ ++ trans_for_each_path_with_node(trans, b, linked) { ++ __bch2_btree_node_iter_fix(linked, b, ++ &linked->l[b->c.level].iter, t, ++ where, clobber_u64s, new_u64s); ++ bch2_btree_path_verify_level(trans, linked, b->c.level); ++ } ++} ++ ++/* Btree path level: pointer to a particular btree node and node iter */ ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, ++ struct btree_path_level *l, ++ struct bkey *u, ++ struct bkey_packed *k) ++{ ++ if (unlikely(!k)) { ++ /* ++ * signal to bch2_btree_iter_peek_slot() that we're currently at ++ * a hole ++ */ ++ u->type = KEY_TYPE_deleted; ++ return bkey_s_c_null; ++ } ++ ++ return bkey_disassemble(l->b, k, u); ++} ++ ++static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, ++ struct btree_path_level *l, ++ struct bkey *u) ++{ ++ return __btree_iter_unpack(c, l, u, ++ bch2_btree_node_iter_peek_all(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_path_level *l, ++ struct bkey *u) ++{ ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ path->pos = k.k ? k.k->p : l->b->key.k.p; ++ bch2_btree_path_verify_level(trans, path, l - path->l); ++ return k; ++} ++ ++static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_path_level *l, ++ struct bkey *u) ++{ ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, ++ bch2_btree_node_iter_prev(&l->iter, l->b)); ++ ++ path->pos = k.k ? 
k.k->p : l->b->data->min_key; ++ bch2_btree_path_verify_level(trans, path, l - path->l); ++ return k; ++} ++ ++static inline bool btree_path_advance_to_pos(struct btree_path *path, ++ struct btree_path_level *l, ++ int max_advance) ++{ ++ struct bkey_packed *k; ++ int nr_advanced = 0; ++ ++ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && ++ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { ++ if (max_advance > 0 && nr_advanced >= max_advance) ++ return false; ++ ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ nr_advanced++; ++ } ++ ++ return true; ++} ++ ++/* ++ * Verify that iterator for parent node points to child node: ++ */ ++static void btree_path_verify_new_node(struct btree_trans *trans, ++ struct btree_path *path, struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l; ++ unsigned plevel; ++ bool parent_locked; ++ struct bkey_packed *k; ++ ++ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ return; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ ++ plevel = b->c.level + 1; ++ if (!btree_path_node(path, plevel)) ++ return; ++ ++ parent_locked = btree_node_locked(path, plevel); ++ ++ if (!bch2_btree_node_relock(trans, path, plevel)) ++ return; ++ ++ l = &path->l[plevel]; ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ if (!k || ++ bkey_deleted(k) || ++ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ struct printbuf buf4 = PRINTBUF; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_dump_btree_node(c, l->b); ++ bch2_bpos_to_text(&buf1, path->pos); ++ bch2_bkey_to_text(&buf2, &uk); ++ bch2_bpos_to_text(&buf3, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); ++ panic("parent iter doesn't point to new node:\n" ++ "iter pos %s %s\n" ++ "iter key %s\n" ++ "new node %s-%s\n", ++ bch2_btree_ids[path->btree_id], ++ buf1.buf, buf2.buf, buf3.buf, buf4.buf); ++ } ++ ++ if (!parent_locked) ++ btree_node_unlock(trans, path, plevel); ++} ++ ++static inline void __btree_path_level_init(struct btree_path *path, ++ unsigned level) ++{ ++ struct btree_path_level *l = &path->l[level]; ++ ++ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); ++ ++ /* ++ * Iterators to interior nodes should always be pointed at the first non ++ * whiteout: ++ */ ++ if (level) ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++} ++ ++static inline void btree_path_level_init(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ BUG_ON(path->cached); ++ ++ btree_path_verify_new_node(trans, path, b); ++ ++ EBUG_ON(!btree_path_pos_in_node(path, b)); ++ EBUG_ON(b->c.lock.state.seq & 1); ++ ++ path->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ path->l[b->c.level].b = b; ++ __btree_path_level_init(path, b->c.level); ++} ++ ++/* Btree path: fixups after btree node updates: */ ++ ++/* ++ * A btree node is being replaced - update the iterator to point to the new ++ * node: ++ */ ++void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (!path->cached && ++ btree_path_pos_in_node(path, b)) { ++ enum btree_node_locked_type t = ++ btree_lock_want(path, b->c.level); ++ ++ if (path->nodes_locked && ++ t != BTREE_NODE_UNLOCKED) { ++ btree_node_unlock(trans, path, b->c.level); ++ six_lock_increment(&b->c.lock, t); ++ mark_btree_node_locked(trans, path, b->c.level, t); ++ } ++ ++ btree_path_level_init(trans, path, b); ++ } 
++} ++ ++/* ++ * A btree node has been modified in such a way as to invalidate iterators - fix ++ * them: ++ */ ++void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path_with_node(trans, b, path) ++ __btree_path_level_init(path, b->c.level); ++} ++ ++/* Btree path: traverse, set_pos: */ ++ ++static int lock_root_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ struct btree **rootp = p; ++ ++ if (b != *rootp) ++ return BCH_ERR_lock_fail_root_changed; ++ return 0; ++} ++ ++static inline int btree_path_lock_root(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned depth_want, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; ++ enum six_lock_type lock_type; ++ unsigned i; ++ int ret; ++ ++ EBUG_ON(path->nodes_locked); ++ ++ while (1) { ++ b = READ_ONCE(*rootp); ++ path->level = READ_ONCE(b->c.level); ++ ++ if (unlikely(path->level < depth_want)) { ++ /* ++ * the root is at a lower depth than the depth we want: ++ * got to the end of the btree, or we're walking nodes ++ * greater than some depth and there are no nodes >= ++ * that depth ++ */ ++ path->level = depth_want; ++ for (i = path->level; i < BTREE_MAX_DEPTH; i++) ++ path->l[i].b = NULL; ++ return 1; ++ } ++ ++ lock_type = __btree_lock_want(path, path->level); ++ ret = btree_node_lock(trans, path, b, SPOS_MAX, ++ path->level, lock_type, ++ lock_root_check_fn, rootp, ++ trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) ++ continue; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ BUG(); ++ } ++ ++ if (likely(b == READ_ONCE(*rootp) && ++ b->c.level == path->level && ++ !race_fault())) { ++ for (i = 0; i < path->level; i++) ++ path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ path->l[path->level].b = b; ++ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) ++ path->l[i].b = NULL; ++ ++ mark_btree_node_locked(trans, path, path->level, lock_type); ++ btree_path_level_init(trans, path, b); ++ return 0; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ } ++} ++ ++noinline ++static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (path->level > 1 ? 0 : 2) ++ : (path->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); ++ k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!k) ++ break; ++ ++ bch2_bkey_buf_unpack(&tmp, c, l->b, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(trans, path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ ++static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, ++ struct btree_and_journal_iter *jiter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (path->level > 1 ? 
0 : 2) ++ : (path->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(jiter); ++ k = bch2_btree_and_journal_iter_peek(jiter); ++ if (!k.k) ++ break; ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(trans, path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ ++static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned plevel, struct btree *b) ++{ ++ struct btree_path_level *l = &path->l[plevel]; ++ bool locked = btree_node_locked(path, plevel); ++ struct bkey_packed *k; ++ struct bch_btree_ptr_v2 *bp; ++ ++ if (!bch2_btree_node_relock(trans, path, plevel)) ++ return; ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); ++ ++ bp = (void *) bkeyp_val(&l->b->format, k); ++ bp->mem_ptr = (unsigned long)b; ++ ++ if (!locked) ++ btree_node_unlock(trans, path, plevel); ++} ++ ++static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ struct bkey_buf *out) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_and_journal_iter jiter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); ++ ++ k = bch2_btree_and_journal_iter_peek(&jiter); ++ ++ bch2_bkey_buf_reassemble(out, c, k); ++ ++ if (flags & BTREE_ITER_PREFETCH) ++ ret = btree_path_prefetch_j(trans, path, &jiter); ++ ++ bch2_btree_and_journal_iter_exit(&jiter); ++ return ret; ++} ++ ++static __always_inline int btree_path_down(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree *b; ++ unsigned level = path->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(path, level); ++ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); ++ struct bkey_buf tmp; ++ int ret; ++ ++ EBUG_ON(!btree_node_locked(path, path->level)); ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ if (unlikely(!replay_done)) { ++ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); ++ if (ret) ++ goto err; ++ } else { ++ bch2_bkey_buf_unpack(&tmp, c, l->b, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (flags & BTREE_ITER_PREFETCH) { ++ ret = btree_path_prefetch(trans, path); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (unlikely(ret)) ++ goto err; ++ ++ mark_btree_node_locked(trans, path, level, lock_type); ++ btree_path_level_init(trans, path, b); ++ ++ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && ++ unlikely(b != btree_node_mem_ptr(tmp.k))) ++ btree_node_mem_ptr_set(trans, path, level + 1, b); ++ ++ if (btree_node_read_locked(path, level + 1)) ++ btree_node_unlock(trans, path, level + 1); ++ path->level = level; ++ ++ bch2_btree_path_verify_locks(path); ++err: ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ ++static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, ++ unsigned, unsigned 
long); ++ ++static int bch2_btree_path_traverse_all(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ unsigned long trace_ip = _RET_IP_; ++ int i, ret = 0; ++ ++ if (trans->in_traverse_all) ++ return -BCH_ERR_transaction_restart_in_traverse_all; ++ ++ trans->in_traverse_all = true; ++retry_all: ++ trans->restarted = 0; ++ trans->traverse_all_idx = U8_MAX; ++ ++ trans_for_each_path(trans, path) ++ path->should_be_locked = false; ++ ++ btree_trans_verify_sorted(trans); ++ ++ for (i = trans->nr_sorted - 2; i >= 0; --i) { ++ struct btree_path *path1 = trans->paths + trans->sorted[i]; ++ struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; ++ ++ if (path1->btree_id == path2->btree_id && ++ path1->locks_want < path2->locks_want) ++ __bch2_btree_path_upgrade(trans, path1, path2->locks_want); ++ else if (!path1->locks_want && path2->locks_want) ++ __bch2_btree_path_upgrade(trans, path1, 1); ++ } ++ ++ bch2_trans_unlock(trans); ++ cond_resched(); ++ ++ if (unlikely(trans->memory_allocation_failure)) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ } ++ ++ /* Now, redo traversals in correct order: */ ++ trans->traverse_all_idx = 0; ++ while (trans->traverse_all_idx < trans->nr_sorted) { ++ path = trans->paths + trans->sorted[trans->traverse_all_idx]; ++ ++ /* ++ * Traversing a path can cause another path to be added at about ++ * the same position: ++ */ ++ if (path->uptodate) { ++ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == -ENOMEM) ++ goto retry_all; ++ if (ret) ++ goto err; ++ BUG_ON(path->uptodate); ++ } else { ++ trans->traverse_all_idx++; ++ } ++ } ++ ++ /* ++ * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() ++ * and relock(), relock() won't relock since path->should_be_locked ++ * isn't set yet, which is all fine ++ */ ++ trans_for_each_path(trans, path) ++ BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); ++err: ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ trans->in_traverse_all = false; ++ ++ trace_trans_traverse_all(trans->fn, trace_ip); ++ return ret; ++} ++ ++static inline bool btree_path_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned l, int check_pos) ++{ ++ if (!is_btree_node(path, l) || ++ !bch2_btree_node_relock(trans, path, l)) ++ return false; ++ ++ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) ++ return false; ++ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) ++ return false; ++ return true; ++} ++ ++static void btree_path_set_level_up(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ btree_node_unlock(trans, path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++} ++ ++static void btree_path_set_level_down(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_level) ++{ ++ unsigned l; ++ ++ path->level = new_level; ++ ++ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) ++ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(trans, path, l); ++ ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ bch2_btree_path_verify(trans, path); ++} ++ ++static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ int check_pos) ++{ ++ unsigned i, 
l = path->level; ++ ++ while (btree_path_node(path, l) && ++ !btree_path_good_node(trans, path, l, check_pos)) { ++ btree_node_unlock(trans, path, l); ++ path->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ /* If we need intent locks, take them too: */ ++ for (i = l + 1; ++ i < path->locks_want && btree_path_node(path, i); ++ i++) ++ if (!bch2_btree_node_relock(trans, path, i)) ++ while (l <= i) { ++ btree_node_unlock(trans, path, l); ++ path->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ return l; ++} ++ ++/* ++ * This is the main state machine for walking down the btree - walks down to a ++ * specified depth ++ * ++ * Returns 0 on success, -EIO on error (error reading in a btree node). ++ * ++ * On error, caller (peek_node()/peek_key()) must return NULL; the error is ++ * stashed in the iterator and returned from bch2_trans_exit(). ++ */ ++static int btree_path_traverse_one(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ unsigned long trace_ip) ++{ ++ unsigned depth_want = path->level; ++ int ret = trans->restarted; ++ ++ if (unlikely(ret)) ++ goto out; ++ ++ /* ++ * Ensure we obey path->should_be_locked: if it's set, we can't unlock ++ * and re-traverse the path without a transaction restart: ++ */ ++ if (path->should_be_locked) { ++ ret = bch2_btree_path_relock(trans, path, trace_ip); ++ goto out; ++ } ++ ++ if (path->cached) { ++ ret = bch2_btree_path_traverse_cached(trans, path, flags); ++ goto out; ++ } ++ ++ if (unlikely(path->level >= BTREE_MAX_DEPTH)) ++ goto out; ++ ++ path->level = btree_path_up_until_good_node(trans, path, 0); ++ ++ /* ++ * Note: path->nodes[path->level] may be temporarily NULL here - that ++ * would indicate to other code that we got to the end of the btree, ++ * here it indicates that relocking the root failed - it's critical that ++ * btree_path_lock_root() comes next and that it can't fail ++ */ ++ while (path->level > depth_want) { ++ ret = btree_path_node(path, path->level) ++ ? 
btree_path_down(trans, path, flags, trace_ip) ++ : btree_path_lock_root(trans, path, depth_want, trace_ip); ++ if (unlikely(ret)) { ++ if (ret == 1) { ++ /* ++ * No nodes at this level - got to the end of ++ * the btree: ++ */ ++ ret = 0; ++ goto out; ++ } ++ ++ __bch2_btree_path_unlock(trans, path); ++ path->level = depth_want; ++ ++ if (ret == -EIO) ++ path->l[path->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ else ++ path->l[path->level].b = ++ BTREE_ITER_NO_NODE_DOWN; ++ goto out; ++ } ++ } ++ ++ path->uptodate = BTREE_ITER_UPTODATE; ++out: ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); ++ bch2_btree_path_verify(trans, path); ++ return ret; ++} ++ ++int __must_check bch2_btree_path_traverse(struct btree_trans *trans, ++ struct btree_path *path, unsigned flags) ++{ ++ if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); ++ u64 mask = ~(~0ULL << restart_probability_bits); ++ ++ if ((prandom_u32() & mask) == mask) { ++ trace_transaction_restart_injected(trans->fn, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ } ++ ++ if (path->uptodate < BTREE_ITER_NEED_RELOCK) ++ return 0; ++ ++ return bch2_trans_cond_resched(trans) ?: ++ btree_path_traverse_one(trans, path, flags, _RET_IP_); ++} ++ ++static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, ++ struct btree_path *src) ++{ ++ unsigned i, offset = offsetof(struct btree_path, pos); ++ ++ memcpy((void *) dst + offset, ++ (void *) src + offset, ++ sizeof(struct btree_path) - offset); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->c.lock, ++ __btree_lock_want(dst, i)); ++ ++ bch2_btree_path_check_sort(trans, dst, 0); ++} ++ ++static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, ++ bool intent) ++{ ++ struct btree_path *new = btree_path_alloc(trans, src); ++ ++ btree_path_copy(trans, new, src); ++ __btree_path_get(new, intent); ++ return new; ++} ++ ++inline struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *trans, ++ struct btree_path *path, bool intent, ++ unsigned long ip) ++{ ++ if (path->ref > 1 || path->preserve) { ++ __btree_path_put(path, intent); ++ path = btree_path_clone(trans, path, intent); ++ path->preserve = false; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_allocated = ip; ++#endif ++ btree_trans_verify_sorted(trans); ++ } ++ ++ path->should_be_locked = false; ++ return path; ++} ++ ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, ++ struct btree_path *path, struct bpos new_pos, ++ bool intent, unsigned long ip) ++{ ++ int cmp = bpos_cmp(new_pos, path->pos); ++ unsigned l = path->level; ++ ++ EBUG_ON(trans->restarted); ++ EBUG_ON(!path->ref); ++ ++ if (!cmp) ++ return path; ++ ++ path = bch2_btree_path_make_mut(trans, path, intent, ip); ++ ++ path->pos = new_pos; ++ ++ bch2_btree_path_check_sort(trans, path, cmp); ++ ++ if (unlikely(path->cached)) { ++ btree_node_unlock(trans, path, 0); ++ path->l[0].b = BTREE_ITER_NO_NODE_CACHED; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ goto out; ++ } ++ ++ l = btree_path_up_until_good_node(trans, path, cmp); ++ ++ if (btree_path_node(path, l)) { ++ BUG_ON(!btree_node_locked(path, l)); ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many 
keys just reinit it (or if we're rewinding, since that ++ * is expensive). ++ */ ++ if (cmp < 0 || ++ !btree_path_advance_to_pos(path, &path->l[l], 8)) ++ __btree_path_level_init(path, l); ++ } ++ ++ if (l != path->level) { ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ __bch2_btree_path_unlock(trans, path); ++ } ++out: ++ bch2_btree_path_verify(trans, path); ++ return path; ++} ++ ++/* Btree path: main interface: */ ++ ++static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree_path *next; ++ ++ next = prev_btree_path(trans, path); ++ if (next && !btree_path_cmp(next, path)) ++ return next; ++ ++ next = next_btree_path(trans, path); ++ if (next && !btree_path_cmp(next, path)) ++ return next; ++ ++ return NULL; ++} ++ ++static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree_path *next; ++ ++ next = prev_btree_path(trans, path); ++ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) ++ return next; ++ ++ next = next_btree_path(trans, path); ++ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) ++ return next; ++ ++ return NULL; ++} ++ ++static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) ++{ ++ __bch2_btree_path_unlock(trans, path); ++ btree_path_list_remove(trans, path); ++ trans->paths_allocated &= ~(1ULL << path->idx); ++} ++ ++void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) ++{ ++ struct btree_path *dup; ++ ++ EBUG_ON(trans->paths + path->idx != path); ++ EBUG_ON(!path->ref); ++ ++ if (!__btree_path_put(path, intent)) ++ return; ++ ++ /* ++ * Perhaps instead we should check for duplicate paths in traverse_all: ++ */ ++ if (path->preserve && ++ (dup = have_path_at_pos(trans, path))) { ++ dup->preserve = true; ++ path->preserve = false; ++ goto free; ++ } ++ ++ if (!path->preserve && ++ (dup = have_node_at_pos(trans, path))) ++ goto free; ++ return; ++free: ++ if (path->should_be_locked && ++ !btree_node_locked(dup, path->level)) ++ return; ++ ++ dup->should_be_locked |= path->should_be_locked; ++ __bch2_path_free(trans, path); ++} ++ ++void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ prt_printf(buf, "transaction updates for %s journal seq %llu", ++ trans->fn, trans->journal_res.seq); ++ prt_newline(buf); ++ printbuf_indent_add(buf, 2); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ prt_printf(buf, "update: btree=%s cached=%u %pS", ++ bch2_btree_ids[i->btree_id], ++ i->cached, ++ (void *) i->ip_allocated); ++ prt_newline(buf); ++ ++ prt_printf(buf, " old "); ++ bch2_bkey_val_to_text(buf, trans->c, old); ++ prt_newline(buf); ++ ++ prt_printf(buf, " new "); ++ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); ++ prt_newline(buf); ++ } ++ ++ printbuf_indent_sub(buf, 2); ++} ++ ++noinline __cold ++void bch2_dump_trans_updates(struct btree_trans *trans) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_trans_updates_to_text(&buf, trans); ++ bch_err(trans->c, "%s", buf.buf); ++ printbuf_exit(&buf); ++} ++ ++noinline __cold ++void bch2_dump_trans_paths_updates(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ struct printbuf buf = PRINTBUF; ++ unsigned idx; ++ ++ trans_for_each_path_inorder(trans, path, idx) { ++ printbuf_reset(&buf); ++ ++ bch2_bpos_to_text(&buf, path->pos); ++ ++ printk(KERN_ERR 
"path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", ++ path->idx, path->ref, path->intent_ref, ++ path->should_be_locked ? " S" : "", ++ path->preserve ? " P" : "", ++ bch2_btree_ids[path->btree_id], ++ path->level, ++ buf.buf, ++ path->nodes_locked, ++#ifdef CONFIG_BCACHEFS_DEBUG ++ (void *) path->ip_allocated ++#else ++ NULL ++#endif ++ ); ++ } ++ ++ printbuf_exit(&buf); ++ ++ bch2_dump_trans_updates(trans); ++} ++ ++static struct btree_path *btree_path_alloc(struct btree_trans *trans, ++ struct btree_path *pos) ++{ ++ struct btree_path *path; ++ unsigned idx; ++ ++ if (unlikely(trans->paths_allocated == ++ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans path oveflow\n"); ++ } ++ ++ idx = __ffs64(~trans->paths_allocated); ++ trans->paths_allocated |= 1ULL << idx; ++ ++ path = &trans->paths[idx]; ++ ++ path->idx = idx; ++ path->ref = 0; ++ path->intent_ref = 0; ++ path->nodes_locked = 0; ++ path->nodes_intent_locked = 0; ++ ++ btree_path_list_add(trans, pos, path); ++ return path; ++} ++ ++struct btree_path *bch2_path_get(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ unsigned locks_want, unsigned level, ++ unsigned flags, unsigned long ip) ++{ ++ struct btree_path *path, *path_pos = NULL; ++ bool cached = flags & BTREE_ITER_CACHED; ++ bool intent = flags & BTREE_ITER_INTENT; ++ int i; ++ ++ BUG_ON(trans->restarted); ++ btree_trans_verify_sorted(trans); ++ bch2_trans_verify_locks(trans); ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ if (__btree_path_cmp(path, ++ btree_id, ++ cached, ++ pos, ++ level) > 0) ++ break; ++ ++ path_pos = path; ++ } ++ ++ if (path_pos && ++ path_pos->cached == cached && ++ path_pos->btree_id == btree_id && ++ path_pos->level == level) { ++ __btree_path_get(path_pos, intent); ++ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); ++ } else { ++ path = btree_path_alloc(trans, path_pos); ++ path_pos = NULL; ++ ++ __btree_path_get(path, intent); ++ path->pos = pos; ++ path->btree_id = btree_id; ++ path->cached = cached; ++ path->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ path->should_be_locked = false; ++ path->level = level; ++ path->locks_want = locks_want; ++ path->nodes_locked = 0; ++ path->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(path->l); i++) ++ path->l[i].b = BTREE_ITER_NO_NODE_INIT; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_allocated = ip; ++#endif ++ btree_trans_verify_sorted(trans); ++ } ++ ++ if (!(flags & BTREE_ITER_NOPRESERVE)) ++ path->preserve = true; ++ ++ if (path->intent_ref) ++ locks_want = max(locks_want, level + 1); ++ ++ /* ++ * If the path has locks_want greater than requested, we don't downgrade ++ * it here - on transaction restart because btree node split needs to ++ * upgrade locks, we might be putting/getting the iterator again. ++ * Downgrading iterators only happens via bch2_trans_downgrade(), after ++ * a successful transaction commit. ++ */ ++ ++ locks_want = min(locks_want, BTREE_MAX_DEPTH); ++ if (locks_want > path->locks_want) { ++ path->locks_want = locks_want; ++ btree_path_get_locks(trans, path, true); ++ } ++ ++ return path; ++} ++ ++inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) ++{ ++ ++ struct bkey_s_c k; ++ ++ if (!path->cached) { ++ struct btree_path_level *l = path_l(path); ++ struct bkey_packed *_k; ++ ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; ++ ++ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); ++ ++ if (!k.k || bpos_cmp(path->pos, k.k->p)) ++ goto hole; ++ } else { ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ ++ EBUG_ON(ck && ++ (path->btree_id != ck->key.btree_id || ++ bkey_cmp(path->pos, ck->key.pos))); ++ ++ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ ++ if (unlikely(!ck || !ck->valid)) ++ return bkey_s_c_null; ++ ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ *u = ck->k->k; ++ k = bkey_i_to_s_c(ck->k); ++ } ++ ++ return k; ++hole: ++ bkey_init(u); ++ u->p = path->pos; ++ return (struct bkey_s_c) { u, NULL }; ++} ++ ++/* Btree iterators: */ ++ ++int __must_check ++__bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); ++} ++ ++int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ int ret; ++ ++ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, ++ btree_iter_search_key(iter), ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); ++ if (ret) ++ return ret; ++ ++ iter->path->should_be_locked = true; ++ return 0; ++} ++ ++/* Iterate across nodes (leaf and interior nodes) */ ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b = NULL; ++ int ret; ++ ++ EBUG_ON(iter->path->cached); ++ bch2_btree_iter_verify(iter); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (ret) ++ goto err; ++ ++ b = btree_path_node(iter->path, iter->path->level); ++ if (!b) ++ goto out; ++ ++ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = b->key.k.p; ++ ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ iter->path->should_be_locked = true; ++ BUG_ON(iter->path->uptodate); ++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++err: ++ b = ERR_PTR(ret); ++ goto out; ++} ++ ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_path *path = iter->path; ++ struct btree *b = NULL; ++ int ret; ++ ++ BUG_ON(trans->restarted); ++ EBUG_ON(iter->path->cached); ++ bch2_btree_iter_verify(iter); ++ ++ /* already at end? */ ++ if (!btree_path_node(path, path->level)) ++ return NULL; ++ ++ /* got to end? 
*/ ++ if (!btree_path_node(path, path->level + 1)) { ++ btree_path_set_level_up(trans, path); ++ return NULL; ++ } ++ ++ if (!bch2_btree_node_relock(trans, path, path->level + 1)) { ++ __bch2_btree_path_unlock(trans, path); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, ++ path->btree_id, &path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); ++ goto err; ++ } ++ ++ b = btree_path_node(path, path->level + 1); ++ ++ if (!bpos_cmp(iter->pos, b->key.k.p)) { ++ btree_node_unlock(trans, path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; ++ } else { ++ /* ++ * Haven't gotten to the end of the parent node: go back down to ++ * the next child node ++ */ ++ path = iter->path = ++ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ btree_path_set_level_down(trans, path, iter->min_depth); ++ ++ ret = bch2_btree_path_traverse(trans, path, iter->flags); ++ if (ret) ++ goto err; ++ ++ b = path->l[path->level].b; ++ } ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = b->key.k.p; ++ ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ iter->path->should_be_locked = true; ++ BUG_ON(iter->path->uptodate); ++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++err: ++ b = ERR_PTR(ret); ++ goto out; ++} ++ ++/* Iterate across keys (in leaf nodes only) */ ++ ++inline bool bch2_btree_iter_advance(struct btree_iter *iter) ++{ ++ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { ++ struct bpos pos = iter->k.p; ++ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS ++ ? bpos_cmp(pos, SPOS_MAX) ++ : bkey_cmp(pos, SPOS_MAX)) != 0; ++ ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ pos = bkey_successor(iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); ++ return ret; ++ } else { ++ if (!btree_path_node(iter->path, iter->path->level)) ++ return true; ++ ++ iter->advanced = true; ++ return false; ++ } ++} ++ ++inline bool bch2_btree_iter_rewind(struct btree_iter *iter) ++{ ++ struct bpos pos = bkey_start_pos(&iter->k); ++ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS ++ ? 
bpos_cmp(pos, POS_MIN) ++ : bkey_cmp(pos, POS_MIN)) != 0; ++ ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ pos = bkey_predecessor(iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); ++ return ret; ++} ++ ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_insert_entry *i; ++ struct bkey_i *ret = NULL; ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id < btree_id) ++ continue; ++ if (i->btree_id > btree_id) ++ break; ++ if (bpos_cmp(i->k->k.p, pos) < 0) ++ continue; ++ if (i->key_cache_already_flushed) ++ continue; ++ if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) ++ ret = i->k; ++ } ++ ++ return ret; ++} ++ ++struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos start_pos, ++ struct bpos end_pos) ++{ ++ struct bkey_i *k; ++ ++ if (bpos_cmp(start_pos, iter->journal_pos) < 0) ++ iter->journal_idx = 0; ++ ++ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, ++ start_pos, end_pos, ++ &iter->journal_idx); ++ ++ iter->journal_pos = k ? k->k.p : end_pos; ++ return k; ++} ++ ++struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos) ++{ ++ return bch2_btree_journal_peek(trans, iter, pos, pos); ++} ++ ++static noinline ++struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_i *next_journal = ++ bch2_btree_journal_peek(trans, iter, iter->path->pos, ++ k.k ? k.k->p : iter->path->l[0].b->key.k.p); ++ ++ if (next_journal) { ++ iter->k = next_journal->k; ++ k = bkey_i_to_s_c(next_journal); ++ } ++ ++ return k; ++} ++ ++/* ++ * Checks btree key cache for key at iter->pos and returns it if present, or ++ * bkey_s_c_null: ++ */ ++static noinline ++struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ int ret; ++ ++ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) ++ return bkey_s_c_null; ++ ++ if (!iter->key_cache_path) ++ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, ++ iter->flags & BTREE_ITER_INTENT, 0, ++ iter->flags|BTREE_ITER_CACHED, ++ _THIS_IP_); ++ ++ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ iter->key_cache_path->should_be_locked = true; ++ ++ return bch2_btree_path_peek_slot(iter->key_cache_path, &u); ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bkey_i *next_update; ++ struct bkey_s_c k, k2; ++ int ret; ++ ++ EBUG_ON(iter->path->cached || iter->path->level); ++ bch2_btree_iter_verify(iter); ++ ++ while (1) { ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } ++ ++ iter->path->should_be_locked = true; ++ ++ k = 
btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ k.k && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { ++ ret = bkey_err(k2); ++ if (ret) { ++ k = k2; ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ goto out; ++ } ++ ++ k = k2; ++ iter->k = *k.k; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) ++ k = btree_trans_peek_journal(trans, iter, k); ++ ++ next_update = iter->flags & BTREE_ITER_WITH_UPDATES ++ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) ++ : NULL; ++ if (next_update && ++ bpos_cmp(next_update->k.p, ++ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ } ++ ++ if (k.k && bkey_deleted(k.k)) { ++ /* ++ * If we've got a whiteout, and it's after the search ++ * key, advance the search key to the whiteout instead ++ * of just after the whiteout - it might be a btree ++ * whiteout, with a real key at the same position, since ++ * in the btree deleted keys sort before non deleted. ++ */ ++ search_key = bpos_cmp(search_key, k.k->p) ++ ? k.k->p ++ : bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (likely(k.k)) { ++ break; ++ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { ++ /* Advance to next leaf node: */ ++ search_key = bpos_successor(iter->path->l[0].b->key.k.p); ++ } else { ++ /* End of btree: */ ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ } ++out: ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = btree_iter_search_key(iter); ++ struct bkey_s_c k; ++ struct bpos iter_pos; ++ int ret; ++ ++ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); ++ ++ if (iter->update_path) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ while (1) { ++ k = __bch2_btree_iter_peek(iter, search_key); ++ if (!k.k || bkey_err(k)) ++ goto out; ++ ++ /* ++ * iter->pos should be mononotically increasing, and always be ++ * equal to the key we just returned - except extents can ++ * straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter_pos = k.k->p; ++ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter_pos = bkey_start_pos(k.k); ++ else ++ iter_pos = iter->pos; ++ ++ if (bkey_cmp(iter_pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ if (iter->update_path && ++ bkey_cmp(iter->update_path->pos, k.k->p)) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_INTENT) && ++ !(iter->flags & BTREE_ITER_IS_EXTENTS) && ++ !iter->update_path) { ++ struct bpos pos = k.k->p; ++ ++ if (pos.snapshot < iter->snapshot) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ pos.snapshot = iter->snapshot; ++ ++ /* ++ * advance, same as on exit for iter->path, but only up ++ * to snapshot ++ */ ++ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = iter->path; ++ ++ iter->update_path = 
bch2_btree_path_set_pos(trans, ++ iter->update_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ } ++ ++ /* ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't have to check these successor() calls: ++ */ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } ++ ++ break; ++ } ++ ++ iter->pos = iter_pos; ++ ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); ++out: ++ if (iter->update_path) { ++ if (iter->update_path->uptodate && ++ (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) { ++ k = bkey_s_c_err(ret); ++ } else { ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } ++ } ++ iter->path->should_be_locked = true; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ iter->pos.snapshot = iter->snapshot; ++ ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) { ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ } ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal ++ * to iterator's current position, returning keys from every level of the btree. ++ * For keys at different levels of the btree that compare equal, the key from ++ * the lower level (leaf) is returned first. ++ */ ++struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(iter->path->cached); ++ bch2_btree_iter_verify(iter); ++ BUG_ON(iter->path->level < iter->min_depth); ++ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); ++ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); ++ ++ while (1) { ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } ++ ++ /* Already at end? 
*/ ++ if (!btree_path_node(iter->path, iter->path->level)) { ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ k = btree_path_level_peek_all(trans->c, ++ &iter->path->l[iter->path->level], &iter->k); ++ ++ /* Check if we should go up to the parent node: */ ++ if (!k.k || ++ (iter->advanced && ++ !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { ++ iter->pos = path_l(iter->path)->b->key.k.p; ++ btree_path_set_level_up(trans, iter->path); ++ iter->advanced = false; ++ continue; ++ } ++ ++ /* ++ * Check if we should go back down to a leaf: ++ * If we're not in a leaf node, we only return the current key ++ * if it exactly matches iter->pos - otherwise we first have to ++ * go back to the leaf: ++ */ ++ if (iter->path->level != iter->min_depth && ++ (iter->advanced || ++ !k.k || ++ bpos_cmp(iter->pos, k.k->p))) { ++ btree_path_set_level_down(trans, iter->path, iter->min_depth); ++ iter->pos = bpos_successor(iter->pos); ++ iter->advanced = false; ++ continue; ++ } ++ ++ /* Check if we should go to the next key: */ ++ if (iter->path->level == iter->min_depth && ++ iter->advanced && ++ k.k && ++ !bpos_cmp(iter->pos, k.k->p)) { ++ iter->pos = bpos_successor(iter->pos); ++ iter->advanced = false; ++ continue; ++ } ++ ++ if (iter->advanced && ++ iter->path->level == iter->min_depth && ++ bpos_cmp(k.k->p, iter->pos)) ++ iter->advanced = false; ++ ++ BUG_ON(iter->advanced); ++ BUG_ON(!k.k); ++ break; ++ } ++ ++ iter->pos = k.k->p; ++out: ++ iter->path->should_be_locked = true; ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_next: returns first key greater than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_advance(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek(iter); ++} ++ ++/** ++ * bch2_btree_iter_peek_prev: returns first key less than or equal to ++ * iterator's current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = iter->pos; ++ struct btree_path *saved_path = NULL; ++ struct bkey_s_c k; ++ struct bkey saved_k; ++ const struct bch_val *saved_v; ++ int ret; ++ ++ EBUG_ON(iter->path->cached || iter->path->level); ++ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); ++ ++ if (iter->flags & BTREE_ITER_WITH_JOURNAL) ++ return bkey_s_c_err(-EIO); ++ ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ search_key.snapshot = U32_MAX; ++ ++ while (1) { ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } ++ ++ k = btree_path_level_peek(trans, iter->path, ++ &iter->path->l[0], &iter->k); ++ if (!k.k || ++ ((iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 ++ : bpos_cmp(k.k->p, search_key) > 0)) ++ k = btree_path_level_prev(trans, iter->path, ++ &iter->path->l[0], &iter->k); ++ ++ bch2_btree_path_check_sort(trans, iter->path, 0); ++ ++ if (likely(k.k)) { ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { ++ if (k.k->p.snapshot == iter->snapshot) ++ goto got_key; ++ ++ /* ++ * If we have a saved candidate, and we're no ++ * longer at the same _key_ (not pos), return ++ * that candidate ++ */ ++ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { ++ bch2_path_put(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->path = saved_path; ++ saved_path = NULL; ++ iter->k = saved_k; ++ k.v = saved_v; ++ goto got_key; ++ } ++ ++ if (bch2_snapshot_is_ancestor(iter->trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ if (saved_path) ++ bch2_path_put(trans, saved_path, ++ iter->flags & BTREE_ITER_INTENT); ++ saved_path = btree_path_clone(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ saved_k = *k.k; ++ saved_v = k.v; ++ } ++ ++ search_key = bpos_predecessor(k.k->p); ++ continue; ++ } ++got_key: ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_predecessor(iter, k.k->p); ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ search_key.snapshot = U32_MAX; ++ continue; ++ } ++ ++ break; ++ } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { ++ /* Advance to previous leaf node: */ ++ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); ++ } else { ++ /* Start of btree: */ ++ bch2_btree_iter_set_pos(iter, POS_MIN); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ } ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); ++ ++ /* Extents can straddle iter->pos: */ ++ if (bkey_cmp(k.k->p, iter->pos) < 0) ++ iter->pos = k.k->p; ++ ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ iter->pos.snapshot = iter->snapshot; ++out: ++ if (saved_path) ++ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); ++ iter->path->should_be_locked = true; ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_prev: returns first key less than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_rewind(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_prev(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); ++ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); ++ ++ /* extents can't span inode numbers: */ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); ++ } ++ ++ search_key = btree_iter_search_key(iter); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ if ((iter->flags & BTREE_ITER_CACHED) || ++ !(iter->flags & 
(BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { ++ struct bkey_i *next_update; ++ ++ if ((iter->flags & BTREE_ITER_WITH_UPDATES) && ++ (next_update = btree_trans_peek_updates(trans, ++ iter->btree_id, search_key)) && ++ !bpos_cmp(next_update->k.p, iter->pos)) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && ++ (next_update = bch2_btree_journal_peek_slot(trans, ++ iter, iter->pos))) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { ++ if (!bkey_err(k)) ++ iter->k = *k.k; ++ goto out; ++ } ++ ++ k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ } else { ++ struct bpos next; ++ ++ EBUG_ON(iter->path->level); ++ ++ if (iter->flags & BTREE_ITER_INTENT) { ++ struct btree_iter iter2; ++ struct bpos end = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ end.offset = U64_MAX; ++ ++ bch2_trans_copy_iter(&iter2, iter); ++ k = bch2_btree_iter_peek_upto(&iter2, end); ++ ++ if (k.k && !bkey_err(k)) { ++ iter->k = iter2.k; ++ k.k = &iter->k; ++ } ++ bch2_trans_iter_exit(trans, &iter2); ++ } else { ++ struct bpos pos = iter->pos; ++ ++ k = bch2_btree_iter_peek(iter); ++ iter->pos = pos; ++ } ++ ++ if (unlikely(bkey_err(k))) ++ return k; ++ ++ next = k.k ? bkey_start_pos(k.k) : POS_MAX; ++ ++ if (bkey_cmp(iter->pos, next) < 0) { ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ bch2_key_resize(&iter->k, ++ min_t(u64, KEY_SIZE_MAX, ++ (next.inode == iter->pos.inode ++ ? next.offset ++ : KEY_OFFSET_MAX) - ++ iter->pos.offset)); ++ EBUG_ON(!iter->k.size); ++ } ++ ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ } ++out: ++ iter->path->should_be_locked = true; ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_advance(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_rewind(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++/* new transactional stuff: */ ++ ++static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); ++} ++ ++static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ for (i = 0; i < trans->nr_sorted; i++) ++ btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); ++#endif ++} ++ ++static void btree_trans_verify_sorted(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_path *path, *prev = NULL; ++ unsigned i; ++ ++ if (!bch2_debug_check_iterators) ++ return; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ if (prev && btree_path_cmp(prev, path) > 0) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans paths out of order!\n"); ++ } ++ prev = path; ++ } ++#endif ++} ++ ++static inline void btree_path_swap(struct btree_trans 
*trans, ++ struct btree_path *l, struct btree_path *r) ++{ ++ swap(l->sorted_idx, r->sorted_idx); ++ swap(trans->sorted[l->sorted_idx], ++ trans->sorted[r->sorted_idx]); ++ ++ btree_path_verify_sorted_ref(trans, l); ++ btree_path_verify_sorted_ref(trans, r); ++} ++ ++inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, ++ int cmp) ++{ ++ struct btree_path *n; ++ ++ if (cmp <= 0) { ++ n = prev_btree_path(trans, path); ++ if (n && btree_path_cmp(n, path) > 0) { ++ do { ++ btree_path_swap(trans, n, path); ++ n = prev_btree_path(trans, path); ++ } while (n && btree_path_cmp(n, path) > 0); ++ ++ goto out; ++ } ++ } ++ ++ if (cmp >= 0) { ++ n = next_btree_path(trans, path); ++ if (n && btree_path_cmp(path, n) > 0) { ++ do { ++ btree_path_swap(trans, path, n); ++ n = next_btree_path(trans, path); ++ } while (n && btree_path_cmp(path, n) > 0); ++ } ++ } ++out: ++ btree_trans_verify_sorted(trans); ++} ++ ++static inline void btree_path_list_remove(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ unsigned i; ++ ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ ++ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); ++ ++ for (i = path->sorted_idx; i < trans->nr_sorted; i++) ++ trans->paths[trans->sorted[i]].sorted_idx = i; ++ ++ path->sorted_idx = U8_MAX; ++ ++ btree_trans_verify_sorted_refs(trans); ++} ++ ++static inline void btree_path_list_add(struct btree_trans *trans, ++ struct btree_path *pos, ++ struct btree_path *path) ++{ ++ unsigned i; ++ ++ btree_trans_verify_sorted_refs(trans); ++ ++ path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; ++ ++ if (trans->in_traverse_all && ++ trans->traverse_all_idx != U8_MAX && ++ trans->traverse_all_idx >= path->sorted_idx) ++ trans->traverse_all_idx++; ++ ++ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); ++ ++ for (i = path->sorted_idx; i < trans->nr_sorted; i++) ++ trans->paths[trans->sorted[i]].sorted_idx = i; ++ ++ btree_trans_verify_sorted_refs(trans); ++} ++ ++void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ if (iter->path) ++ bch2_path_put(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ if (iter->update_path) ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ if (iter->key_cache_path) ++ bch2_path_put(trans, iter->key_cache_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; ++} ++ ++static void __bch2_trans_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned btree_id, struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags, ++ unsigned long ip) ++{ ++ EBUG_ON(trans->restarted); ++ ++ if (flags & BTREE_ITER_ALL_LEVELS) ++ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; ++ ++ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && ++ btree_node_type_is_extents(btree_id)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ ++ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && ++ !btree_type_has_snapshots(btree_id)) ++ flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++ ++ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ btree_type_has_snapshots(btree_id)) ++ flags |= BTREE_ITER_FILTER_SNAPSHOTS; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ flags |= BTREE_ITER_WITH_JOURNAL; ++ ++ iter->trans = trans; ++ iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; ++ iter->btree_id = btree_id; ++ iter->min_depth = depth; 
++ iter->flags = flags; ++ iter->snapshot = pos.snapshot; ++ iter->pos = pos; ++ iter->k.type = KEY_TYPE_deleted; ++ iter->k.p = pos; ++ iter->k.size = 0; ++ iter->journal_idx = 0; ++ iter->journal_pos = POS_MIN; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->ip_allocated = ip; ++#endif ++ ++ iter->path = bch2_path_get(trans, btree_id, iter->pos, ++ locks_want, depth, flags, ip); ++} ++ ++void bch2_trans_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) ++{ ++ if (!btree_id_cached(trans->c, btree_id)) { ++ flags &= ~BTREE_ITER_CACHED; ++ flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ } else if (!(flags & BTREE_ITER_CACHED)) ++ flags |= BTREE_ITER_WITH_KEY_CACHE; ++ ++ __bch2_trans_iter_init(trans, iter, btree_id, pos, ++ 0, 0, flags, _RET_IP_); ++} ++ ++void bch2_trans_node_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) ++{ ++ __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, ++ BTREE_ITER_NOT_EXTENTS| ++ __BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ flags, _RET_IP_); ++ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); ++ BUG_ON(iter->path->level != depth); ++ BUG_ON(iter->min_depth != depth); ++} ++ ++void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) ++{ ++ *dst = *src; ++ if (src->path) ++ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); ++ if (src->update_path) ++ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); ++ dst->key_cache_path = NULL; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ size_t new_top = trans->mem_top + size; ++ void *p; ++ ++ if (new_top > trans->mem_bytes) { ++ size_t old_bytes = trans->mem_bytes; ++ size_t new_bytes = roundup_pow_of_two(new_top); ++ void *new_mem; ++ ++ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); ++ ++ new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { ++ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); ++ new_bytes = BTREE_TRANS_MEM_MAX; ++ kfree(trans->mem); ++ } ++ ++ if (!new_mem) ++ return ERR_PTR(-ENOMEM); ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ ++ if (old_bytes) { ++ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); ++ } ++ } ++ ++ p = trans->mem + trans->mem_top; ++ trans->mem_top += size; ++ memset(p, 0, size); ++ return p; ++} ++ ++/** ++ * bch2_trans_begin() - reset a transaction after a interrupted attempt ++ * @trans: transaction to reset ++ * ++ * While iterating over nodes or updating nodes a attempt to lock a btree node ++ * may return BCH_ERR_transaction_restart when the trylock fails. When this ++ * occurs bch2_trans_begin() should be called and the transaction retried. 
++ */ ++u32 bch2_trans_begin(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ bch2_trans_reset_updates(trans); ++ ++ trans->mem_top = 0; ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset((void *) trans->fs_usage_deltas + ++ offsetof(struct replicas_delta_list, memset_start), 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ trans_for_each_path(trans, path) { ++ path->should_be_locked = false; ++ ++ /* ++ * If the transaction wasn't restarted, we're presuming to be ++ * doing something new: dont keep iterators excpt the ones that ++ * are in use - except for the subvolumes btree: ++ */ ++ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) ++ path->preserve = false; ++ ++ /* ++ * XXX: we probably shouldn't be doing this if the transaction ++ * was restarted, but currently we still overflow transaction ++ * iterators if we do that ++ */ ++ if (!path->ref && !path->preserve) ++ __bch2_path_free(trans, path); ++ else ++ path->preserve = false; ++ } ++ ++ if (!trans->restarted && ++ (need_resched() || ++ ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { ++ bch2_trans_unlock(trans); ++ cond_resched(); ++ bch2_trans_relock(trans); ++ } ++ ++ trans->last_restarted_ip = _RET_IP_; ++ if (trans->restarted) ++ bch2_btree_path_traverse_all(trans); ++ ++ trans->last_begin_time = ktime_get_ns(); ++ return trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) ++{ ++ bch2_trans_inconsistent_on(trans_was_restarted(trans, restart_count), trans, ++ "trans->restart_count %u, should be %u, last restarted by %ps\n", ++ trans->restart_count, restart_count, ++ (void *) trans->last_restarted_ip); ++} ++ ++static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) ++{ ++ size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; ++ size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; ++ void *p = NULL; ++ ++ BUG_ON(trans->used_mempool); ++ ++#ifdef __KERNEL__ ++ p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); ++#endif ++ if (!p) ++ p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); ++ ++ trans->paths = p; p += paths_bytes; ++ trans->updates = p; p += updates_bytes; ++} ++ ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes, ++ const char *fn) ++ __acquires(&c->btree_trans_barrier) ++{ ++ struct btree_trans *pos; ++ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ memset(trans, 0, sizeof(*trans)); ++ trans->c = c; ++ trans->fn = fn; ++ trans->last_begin_time = ktime_get_ns(); ++ trans->task = current; ++ ++ while (c->lock_held_stats.names[trans->lock_name_idx] != fn ++ && c->lock_held_stats.names[trans->lock_name_idx] != 0) ++ trans->lock_name_idx++; ++ ++ if (trans->lock_name_idx >= BCH_LOCK_TIME_NR) ++ pr_warn_once("lock_times array not big enough!"); ++ else ++ c->lock_held_stats.names[trans->lock_name_idx] = fn; ++ ++ bch2_trans_alloc_paths(trans, c); ++ ++ if (expected_mem_bytes) { ++ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); ++ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); ++ ++ if (!unlikely(trans->mem)) { ++ trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); ++ trans->mem_bytes = BTREE_TRANS_MEM_MAX; ++ } ++ } ++ ++ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ ++ 
mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(pos, &c->btree_trans_list, list) { ++ if (trans->task->pid < pos->task->pid) { ++ list_add_tail(&trans->list, &pos->list); ++ goto list_add_done; ++ } ++ } ++ list_add_tail(&trans->list, &c->btree_trans_list); ++list_add_done: ++ mutex_unlock(&c->btree_trans_lock); ++} ++ ++static void check_btree_paths_leaked(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (path->ref) ++ goto leaked; ++ return; ++leaked: ++ bch_err(c, "btree paths leaked from %s!", trans->fn); ++ trans_for_each_path(trans, path) ++ if (path->ref) ++ printk(KERN_ERR " btree %s %pS\n", ++ bch2_btree_ids[path->btree_id], ++ (void *) path->ip_allocated); ++ /* Be noisy about this: */ ++ bch2_fatal_error(c); ++#endif ++} ++ ++void bch2_trans_exit(struct btree_trans *trans) ++ __releases(&c->btree_trans_barrier) ++{ ++ struct btree_insert_entry *i; ++ struct bch_fs *c = trans->c; ++ ++ bch2_trans_unlock(trans); ++ ++ trans_for_each_update(trans, i) ++ __btree_path_put(i->path, true); ++ trans->nr_updates = 0; ++ ++ check_btree_paths_leaked(trans); ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_del(&trans->list); ++ mutex_unlock(&c->btree_trans_lock); ++ ++ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); ++ ++ kfree(trans->extra_journal_entries.data); ++ ++ if (trans->fs_usage_deltas) { ++ if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == ++ REPLICAS_DELTA_LIST_MAX) ++ mempool_free(trans->fs_usage_deltas, ++ &c->replicas_delta_pool); ++ else ++ kfree(trans->fs_usage_deltas); ++ } ++ ++ if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) ++ mempool_free(trans->mem, &c->btree_trans_mem_pool); ++ else ++ kfree(trans->mem); ++ ++#ifdef __KERNEL__ ++ /* ++ * Userspace doesn't have a real percpu implementation: ++ */ ++ trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); ++#endif ++ ++ if (trans->paths) ++ mempool_free(trans->paths, &c->btree_paths_pool); ++ ++ trans->mem = (void *) 0x1; ++ trans->paths = (void *) 0x1; ++} ++ ++static void __maybe_unused ++bch2_btree_path_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ bool cached) ++{ ++ prt_printf(out, " l=%u %s:", ++ _b->level, bch2_btree_ids[_b->btree_id]); ++ bch2_bpos_to_text(out, btree_node_pos(_b, cached)); ++} ++ ++void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ struct btree *b; ++ static char lock_types[] = { 'r', 'i', 'w' }; ++ unsigned l; ++ ++ prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); ++ ++ trans_for_each_path(trans, path) { ++ if (!path->nodes_locked) ++ continue; ++ ++ prt_printf(out, " path %u %c l=%u %s:", ++ path->idx, ++ path->cached ? 'c' : 'b', ++ path->level, ++ bch2_btree_ids[path->btree_id]); ++ bch2_bpos_to_text(out, path->pos); ++ prt_printf(out, "\n"); ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) { ++ if (btree_node_locked(path, l)) { ++ prt_printf(out, " %s l=%u ", ++ btree_node_intent_locked(path, l) ? "i" : "r", l); ++ bch2_btree_path_node_to_text(out, ++ (void *) path->l[l].b, ++ path->cached); ++ prt_printf(out, "\n"); ++ } ++ } ++ } ++ ++ b = READ_ONCE(trans->locking); ++ if (b) { ++ path = &trans->paths[trans->locking_path_idx]; ++ prt_printf(out, " locking path %u %c l=%u %c %s:", ++ trans->locking_path_idx, ++ path->cached ? 
'c' : 'b', ++ trans->locking_level, ++ lock_types[trans->locking_lock_type], ++ bch2_btree_ids[trans->locking_btree_id]); ++ bch2_bpos_to_text(out, trans->locking_pos); ++ ++ prt_printf(out, " node "); ++ bch2_btree_path_node_to_text(out, ++ (void *) b, path->cached); ++ prt_printf(out, "\n"); ++ } ++} ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *c) ++{ ++ if (c->btree_trans_barrier_initialized) ++ cleanup_srcu_struct(&c->btree_trans_barrier); ++ mempool_exit(&c->btree_trans_mem_pool); ++ mempool_exit(&c->btree_paths_pool); ++} ++ ++int bch2_fs_btree_iter_init(struct bch_fs *c) ++{ ++ unsigned nr = BTREE_ITER_MAX; ++ int ret; ++ ++ INIT_LIST_HEAD(&c->btree_trans_list); ++ mutex_init(&c->btree_trans_lock); ++ ++ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, ++ sizeof(struct btree_path) * nr + ++ sizeof(struct btree_insert_entry) * nr) ?: ++ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, ++ BTREE_TRANS_MEM_MAX) ?: ++ init_srcu_struct(&c->btree_trans_barrier); ++ if (!ret) ++ c->btree_trans_barrier_initialized = true; ++ return ret; ++} +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +new file mode 100644 +index 000000000000..1b02f75d4cab +--- /dev/null ++++ b/fs/bcachefs/btree_iter.h +@@ -0,0 +1,556 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_ITER_H ++#define _BCACHEFS_BTREE_ITER_H ++ ++#include "bset.h" ++#include "btree_types.h" ++ ++#include ++ ++static inline void __btree_path_get(struct btree_path *path, bool intent) ++{ ++ path->ref++; ++ path->intent_ref += intent; ++} ++ ++static inline bool __btree_path_put(struct btree_path *path, bool intent) ++{ ++ EBUG_ON(!path->ref); ++ EBUG_ON(!path->intent_ref && intent); ++ path->intent_ref -= intent; ++ return --path->ref == 0; ++} ++ ++static inline void btree_path_set_dirty(struct btree_path *path, ++ enum btree_path_uptodate u) ++{ ++ path->uptodate = max_t(unsigned, path->uptodate, u); ++} ++ ++static inline struct btree *btree_path_node(struct btree_path *path, ++ unsigned level) ++{ ++ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; ++} ++ ++static inline bool btree_node_lock_seq_matches(const struct btree_path *path, ++ const struct btree *b, unsigned level) ++{ ++ /* ++ * We don't compare the low bits of the lock sequence numbers because ++ * @path might have taken a write lock on @b, and we don't want to skip ++ * the linked path if the sequence numbers were equal before taking that ++ * write lock. 
The lock sequence number is incremented by taking and ++ * releasing write locks and is even when unlocked: ++ */ ++ return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; ++} ++ ++static inline struct btree *btree_node_parent(struct btree_path *path, ++ struct btree *b) ++{ ++ return btree_path_node(path, b->c.level + 1); ++} ++ ++/* Iterate over paths within a transaction: */ ++ ++static inline struct btree_path * ++__trans_next_path(struct btree_trans *trans, unsigned idx) ++{ ++ u64 l; ++ ++ if (idx == BTREE_ITER_MAX) ++ return NULL; ++ ++ l = trans->paths_allocated >> idx; ++ if (!l) ++ return NULL; ++ ++ idx += __ffs64(l); ++ EBUG_ON(idx >= BTREE_ITER_MAX); ++ EBUG_ON(trans->paths[idx].idx != idx); ++ return &trans->paths[idx]; ++} ++ ++void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ ++#define trans_for_each_path(_trans, _path) \ ++ for (_path = __trans_next_path((_trans), 0); \ ++ (_path); \ ++ _path = __trans_next_path((_trans), (_path)->idx + 1)) ++ ++static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) ++{ ++ unsigned idx = path ? path->sorted_idx + 1 : 0; ++ ++ EBUG_ON(idx > trans->nr_sorted); ++ ++ return idx < trans->nr_sorted ++ ? trans->paths + trans->sorted[idx] ++ : NULL; ++} ++ ++static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) ++{ ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ return path->sorted_idx ++ ? trans->paths + trans->sorted[path->sorted_idx - 1] ++ : NULL; ++} ++ ++#define trans_for_each_path_inorder(_trans, _path, _i) \ ++ for (_i = 0; \ ++ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ ++ _i++) ++ ++static inline bool __path_has_node(const struct btree_path *path, ++ const struct btree *b) ++{ ++ return path->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(path, b, b->c.level); ++} ++ ++static inline struct btree_path * ++__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, ++ unsigned idx) ++{ ++ struct btree_path *path = __trans_next_path(trans, idx); ++ ++ while (path && !__path_has_node(path, b)) ++ path = __trans_next_path(trans, path->idx + 1); ++ ++ return path; ++} ++ ++#define trans_for_each_path_with_node(_trans, _b, _path) \ ++ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ ++ (_path); \ ++ _path = __trans_next_path_with_node((_trans), (_b), \ ++ (_path)->idx + 1)) ++ ++struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++ bool, unsigned long); ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long); ++int __must_check bch2_btree_path_traverse(struct btree_trans *, ++ struct btree_path *, unsigned); ++struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned, unsigned long); ++inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); ++ ++struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, ++ struct btree_iter *, struct bpos); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_trans_verify_paths(struct btree_trans *); ++void bch2_trans_verify_locks(struct btree_trans *); ++void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, ++ struct bpos, bool); ++#else ++static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} ++static inline void bch2_trans_verify_locks(struct 
btree_trans *trans) {} ++static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++ struct bpos pos, bool key_cache) {} ++#endif ++ ++void bch2_btree_path_fix_key_modified(struct btree_trans *trans, ++ struct btree *, struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, ++ struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, unsigned, unsigned); ++ ++int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); ++ ++void bch2_path_put(struct btree_trans *, struct btree_path *, bool); ++ ++int bch2_trans_relock(struct btree_trans *); ++void bch2_trans_unlock(struct btree_trans *); ++ ++static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) ++{ ++ return restart_count != trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *, u32); ++ ++__always_inline ++static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) ++{ ++ BUG_ON(err <= 0); ++ BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); ++ ++ trans->restarted = err; ++ trans->restart_count++; ++ return -err; ++} ++ ++__always_inline ++static inline int btree_trans_restart(struct btree_trans *trans, int err) ++{ ++ btree_trans_restart_nounlock(trans, err); ++ return -err; ++} ++ ++bool bch2_btree_node_upgrade(struct btree_trans *, ++ struct btree_path *, unsigned); ++ ++bool __bch2_btree_path_upgrade(struct btree_trans *, ++ struct btree_path *, unsigned); ++ ++static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_locks_want) ++{ ++ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); ++ ++ return path->locks_want < new_locks_want ++ ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) ++ : path->uptodate == BTREE_ITER_UPTODATE; ++} ++ ++void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); ++ ++static inline void bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ unsigned new_locks_want = path->level + !!path->intent_ref; ++ ++ if (path->locks_want > new_locks_want) ++ __bch2_btree_path_downgrade(trans, path, new_locks_want); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *); ++ ++void bch2_trans_node_add(struct btree_trans *trans, struct btree *); ++void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); ++int __must_check bch2_btree_iter_traverse(struct btree_iter *); ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); ++ ++static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ return bch2_btree_iter_peek_upto(iter, SPOS_MAX); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); ++ ++bool bch2_btree_iter_advance(struct btree_iter *); ++bool bch2_btree_iter_rewind(struct btree_iter *); ++ ++static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ iter->k.type = KEY_TYPE_deleted; ++ iter->k.p.inode = iter->pos.inode = new_pos.inode; ++ iter->k.p.offset = iter->pos.offset = new_pos.offset; ++ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; ++ iter->k.size = 0; ++} ++ ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ if (unlikely(iter->update_path)) ++ bch2_path_put(iter->trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ new_pos.snapshot = iter->snapshot; ++ ++ __bch2_btree_iter_set_pos(iter, new_pos); ++} ++ ++static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) ++{ ++ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); ++ iter->pos = bkey_start_pos(&iter->k); ++} ++ ++static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) ++{ ++ struct bpos pos = iter->pos; ++ ++ iter->snapshot = snapshot; ++ pos.snapshot = snapshot; ++ bch2_btree_iter_set_pos(iter, pos); ++} ++ ++void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); ++void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, ++ unsigned, struct bpos, unsigned); ++void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); ++ ++static inline void set_btree_iter_dontneed(struct btree_iter *iter) ++{ ++ iter->path->preserve = false; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++u32 bch2_trans_begin(struct btree_trans *); ++ ++static inline struct btree * 
++__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct btree *b; ++ ++ while (b = bch2_btree_iter_peek_node(iter), ++ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) ++ bch2_trans_begin(trans); ++ ++ return b; ++} ++ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b, _ret) \ ++ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ ++ _start, _locks_want, _depth, _flags); \ ++ (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ ++ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ ++ (_b) = bch2_btree_iter_next_node(&(_iter))) ++ ++#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _flags, _b, _ret) \ ++ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ 0, 0, _flags, _b, _ret) ++ ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, ++ unsigned flags) ++{ ++ BUG_ON(flags & BTREE_ITER_ALL_LEVELS); ++ ++ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek_prev(iter); ++} ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : ++ flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek(iter); ++} ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, ++ struct bpos end, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_ITER_SLOTS)) ++ return bch2_btree_iter_peek_upto(iter, end); ++ ++ if (bkey_cmp(iter->pos, end) > 0) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++static inline int btree_trans_too_many_iters(struct btree_trans *trans) ++{ ++ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX) { ++ trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); ++ } ++ ++ return 0; ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_s_c k; ++ ++ while (btree_trans_too_many_iters(trans) || ++ (k = bch2_btree_iter_peek_type(iter, flags), ++ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) ++ bch2_trans_begin(trans); ++ ++ return k; ++} ++ ++#define lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count; \ ++ int _ret; \ ++ \ ++ do { \ ++ _restart_count = bch2_trans_begin(_trans); \ ++ _ret = (_do); \ ++ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ ++ \ ++ if (!_ret) \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ _ret; \ ++}) ++ ++/* ++ * nested_lockrestart_do(), nested_commit_do(): ++ * ++ * These are like lockrestart_do() and commit_do(), with two differences: ++ * ++ * - We don't call bch2_trans_begin() unless we had a transaction restart ++ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a ++ * transaction restart ++ */ ++#define nested_lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count, _orig_restart_count; \ ++ int _ret; \ ++ \ ++ _restart_count = _orig_restart_count = (_trans)->restart_count; \ ++ \ ++ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ ++ _restart_count = bch2_trans_begin(_trans); \ ++ \ ++ if (!_ret) \ ++ 
bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ ++ _ret = -BCH_ERR_transaction_restart_nested; \ ++ \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key2(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_advance(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_rewind(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ ++ _start, _iter_flags, _k, \ ++ _disk_res, _journal_seq, _commit_flags,\ ++ _do) \ ++ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ ++ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_commit_flags))) ++ ++#define for_each_btree_key(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++/* new multiple iterator interface: */ ++ ++void bch2_trans_updates_to_text(struct 
printbuf *, struct btree_trans *); ++void bch2_dump_trans_updates(struct btree_trans *); ++void bch2_dump_trans_paths_updates(struct btree_trans *); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, ++ unsigned, size_t, const char *); ++void bch2_trans_exit(struct btree_trans *); ++ ++#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) ++ ++void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *); ++int bch2_fs_btree_iter_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +new file mode 100644 +index 000000000000..661006e427f2 +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.c +@@ -0,0 +1,855 @@ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "errcode.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++ ++#include ++#include ++ ++static struct kmem_cache *bch2_key_cache; ++ ++static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct bkey_cached *ck = obj; ++ const struct bkey_cached_key *key = arg->key; ++ ++ return cmp_int(ck->key.btree_id, key->btree_id) ?: ++ bpos_cmp(ck->key.pos, key->pos); ++} ++ ++static const struct rhashtable_params bch2_btree_key_cache_params = { ++ .head_offset = offsetof(struct bkey_cached, hash), ++ .key_offset = offsetof(struct bkey_cached, key), ++ .key_len = sizeof(struct bkey_cached_key), ++ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, ++}; ++ ++__flatten ++inline struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++{ ++ struct bkey_cached_key key = { ++ .btree_id = btree_id, ++ .pos = pos, ++ }; ++ ++ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, ++ bch2_btree_key_cache_params); ++} ++ ++static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) ++{ ++ if (!six_trylock_intent(&ck->c.lock)) ++ return false; ++ ++ if (!six_trylock_write(&ck->c.lock)) { ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void bkey_cached_evict(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, ++ bch2_btree_key_cache_params)); ++ memset(&ck->key, ~0, sizeof(ck->key)); ++ ++ atomic_long_dec(&c->nr_keys); ++} ++ ++static void bkey_cached_free(struct btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++ ++ ck->btree_trans_barrier_seq = ++ start_poll_synchronize_srcu(&c->btree_trans_barrier); ++ ++ list_move_tail(&ck->list, &bc->freed); ++ atomic_long_inc(&bc->nr_freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static void bkey_cached_free_fast(struct btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ struct btree_key_cache_freelist *f; ++ bool freed = false; ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++ ++ ck->btree_trans_barrier_seq = ++ 
start_poll_synchronize_srcu(&c->btree_trans_barrier); ++ ++ list_del_init(&ck->list); ++ atomic_long_inc(&bc->nr_freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ preempt_disable(); ++ f = this_cpu_ptr(bc->pcpu_freed); ++ ++ if (f->nr < ARRAY_SIZE(f->objs)) { ++ f->objs[f->nr++] = ck; ++ freed = true; ++ } ++ preempt_enable(); ++ ++ if (!freed) { ++ mutex_lock(&bc->lock); ++ preempt_disable(); ++ f = this_cpu_ptr(bc->pcpu_freed); ++ ++ while (f->nr > ARRAY_SIZE(f->objs) / 2) { ++ struct bkey_cached *ck2 = f->objs[--f->nr]; ++ ++ list_move_tail(&ck2->list, &bc->freed); ++ } ++ preempt_enable(); ++ ++ list_move_tail(&ck->list, &bc->freed); ++ mutex_unlock(&bc->lock); ++ } ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static struct bkey_cached * ++bkey_cached_alloc(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck = NULL; ++ struct btree_key_cache_freelist *f; ++ ++ preempt_disable(); ++ f = this_cpu_ptr(c->pcpu_freed); ++ if (f->nr) ++ ck = f->objs[--f->nr]; ++ preempt_enable(); ++ ++ if (!ck) { ++ mutex_lock(&c->lock); ++ preempt_disable(); ++ f = this_cpu_ptr(c->pcpu_freed); ++ ++ while (!list_empty(&c->freed) && ++ f->nr < ARRAY_SIZE(f->objs) / 2) { ++ ck = list_last_entry(&c->freed, struct bkey_cached, list); ++ list_del_init(&ck->list); ++ f->objs[f->nr++] = ck; ++ } ++ ++ ck = f->nr ? f->objs[--f->nr] : NULL; ++ preempt_enable(); ++ mutex_unlock(&c->lock); ++ } ++ ++ if (ck) { ++ six_lock_intent(&ck->c.lock, NULL, NULL); ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ return ck; ++ } ++ ++ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); ++ if (likely(ck)) { ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ return ck; ++ } ++ ++ return NULL; ++} ++ ++static struct bkey_cached * ++bkey_cached_reuse(struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct bkey_cached *ck; ++ unsigned i; ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ rcu_read_unlock(); ++ return ck; ++ } ++ } ++ rcu_read_unlock(); ++ ++ return NULL; ++} ++ ++static struct bkey_cached * ++btree_key_cache_create(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bkey_cached *ck; ++ bool was_new = true; ++ ++ ck = bkey_cached_alloc(bc); ++ ++ if (unlikely(!ck)) { ++ ck = bkey_cached_reuse(bc); ++ if (unlikely(!ck)) { ++ bch_err(c, "error allocating memory for key cache item, btree %s", ++ bch2_btree_ids[btree_id]); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ was_new = false; ++ } else { ++ if (btree_id == BTREE_ID_subvolumes) ++ six_lock_pcpu_alloc(&ck->c.lock); ++ else ++ six_lock_pcpu_free(&ck->c.lock); ++ } ++ ++ ck->c.level = 0; ++ ck->c.btree_id = btree_id; ++ ck->key.btree_id = btree_id; ++ ck->key.pos = pos; ++ ck->valid = false; ++ ck->flags = 1U << BKEY_CACHED_ACCESSED; ++ ++ if (unlikely(rhashtable_lookup_insert_fast(&bc->table, ++ &ck->hash, ++ bch2_btree_key_cache_params))) { ++ /* We raced with another fill: */ ++ ++ if (likely(was_new)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ kfree(ck); ++ } else { ++ bkey_cached_free_fast(bc, ck); ++ } ++ ++ return NULL; ++ } 
++ ++ atomic_long_inc(&bc->nr_keys); ++ ++ six_unlock_write(&ck->c.lock); ++ ++ return ck; ++} ++ ++static int btree_key_cache_fill(struct btree_trans *trans, ++ struct btree_path *ck_path, ++ struct bkey_cached *ck) ++{ ++ struct btree_path *path; ++ struct bkey_s_c k; ++ unsigned new_u64s = 0; ++ struct bkey_i *new_k = NULL; ++ struct bkey u; ++ int ret; ++ ++ path = bch2_path_get(trans, ck->key.btree_id, ++ ck->key.pos, 0, 0, 0, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, path, 0); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_path_peek_slot(path, &u); ++ ++ if (!bch2_btree_node_relock(trans, ck_path, 0)) { ++ trace_trans_restart_relock_key_cache_fill(trans->fn, ++ _THIS_IP_, ck_path->btree_id, &ck_path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); ++ goto err; ++ } ++ ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at ++ * most 7 bytes (it won't be used): ++ */ ++ new_u64s = k.k->u64s + 1; ++ ++ /* ++ * Allocate some extra space so that the transaction commit path is less ++ * likely to have to reallocate, since that requires a transaction ++ * restart: ++ */ ++ new_u64s = min(256U, (new_u64s * 3) / 2); ++ ++ if (new_u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(new_u64s); ++ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[ck->key.btree_id], new_u64s); ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ /* ++ * XXX: not allowed to be holding read locks when we take a write lock, ++ * currently ++ */ ++ bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); ++ if (new_k) { ++ kfree(ck->k); ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ } ++ ++ bkey_reassemble(ck->k, k); ++ ck->valid = true; ++ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); ++ ++ /* We're not likely to need this iterator again: */ ++ path->preserve = false; ++err: ++ bch2_path_put(trans, path, 0); ++ return ret; ++} ++ ++static int bkey_cached_check_fn(struct six_lock *lock, void *p) ++{ ++ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); ++ const struct btree_path *path = p; ++ ++ if (ck->key.btree_id != path->btree_id && ++ bpos_cmp(ck->key.pos, path->pos)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; ++} ++ ++__flatten ++int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck; ++ int ret = 0; ++ ++ BUG_ON(path->level); ++ ++ path->l[1].b = NULL; ++ ++ if (bch2_btree_node_relock(trans, path, 0)) { ++ ck = (void *) path->l[0].b; ++ goto fill; ++ } ++retry: ++ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); ++ if (!ck) { ++ if (flags & BTREE_ITER_CACHED_NOCREATE) { ++ path->l[0].b = NULL; ++ return 0; ++ } ++ ++ ck = btree_key_cache_create(c, path->btree_id, path->pos); ++ ret = PTR_ERR_OR_ZERO(ck); ++ if (ret) ++ goto err; ++ if (!ck) ++ goto retry; ++ ++ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); ++ path->locks_want = 1; ++ } else { ++ enum six_lock_type lock_want = __btree_lock_want(path, 0); ++ ++ ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, ++ lock_want, ++ bkey_cached_check_fn, path, _THIS_IP_); ++ if (ret) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) ++ goto retry; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto err; ++ BUG(); ++ } ++ ++ if (ck->key.btree_id != path->btree_id || 
++ bpos_cmp(ck->key.pos, path->pos)) { ++ six_unlock_type(&ck->c.lock, lock_want); ++ goto retry; ++ } ++ ++ mark_btree_node_locked(trans, path, 0, lock_want); ++ } ++ ++ path->l[0].lock_seq = ck->c.lock.state.seq; ++ path->l[0].b = (void *) ck; ++fill: ++ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!path->locks_want && ++ !__bch2_btree_path_upgrade(trans, path, 1)) { ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ goto err; ++ } ++ ++ ret = btree_key_cache_fill(trans, path, ck); ++ if (ret) ++ goto err; ++ } ++ ++ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ set_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ ++ path->uptodate = BTREE_ITER_UPTODATE; ++ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); ++ ++ return ret; ++err: ++ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ btree_node_unlock(trans, path, 0); ++ path->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ } ++ return ret; ++} ++ ++static int btree_key_cache_flush_pos(struct btree_trans *trans, ++ struct bkey_cached_key key, ++ u64 journal_seq, ++ unsigned commit_flags, ++ bool evict) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree_iter c_iter, b_iter; ++ struct bkey_cached *ck = NULL; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ ++ ret = bch2_btree_iter_traverse(&c_iter); ++ if (ret) ++ goto out; ++ ++ ck = (void *) c_iter.path->l[0].b; ++ if (!ck) ++ goto out; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ if (evict) ++ goto evict; ++ goto out; ++ } ++ ++ BUG_ON(!ck->valid); ++ ++ if (journal_seq && ck->journal.seq != journal_seq) ++ goto out; ++ ++ /* ++ * Since journal reclaim depends on us making progress here, and the ++ * allocator/copygc depend on journal reclaim making progress, we need ++ * to be using alloc reserves: ++ * */ ++ ret = bch2_btree_iter_traverse(&b_iter) ?: ++ bch2_trans_update(trans, &b_iter, ck->k, ++ BTREE_UPDATE_KEY_CACHE_RECLAIM| ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ (ck->journal.seq == journal_last_seq(j) ++ ? 
JOURNAL_WATERMARK_reserved ++ : 0)| ++ commit_flags); ++ ++ bch2_fs_fatal_err_on(ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && ++ !bch2_journal_error(j), c, ++ "error flushing key cache: %s", bch2_err_str(ret)); ++ if (ret) ++ goto out; ++ ++ bch2_journal_pin_drop(j, &ck->journal); ++ bch2_journal_preres_put(j, &ck->res); ++ ++ BUG_ON(!btree_node_locked(c_iter.path, 0)); ++ ++ if (!evict) { ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ atomic_long_dec(&c->btree_key_cache.nr_dirty); ++ } ++ } else { ++evict: ++ BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); ++ ++ mark_btree_node_unlocked(c_iter.path, 0); ++ c_iter.path->l[0].b = NULL; ++ ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ atomic_long_dec(&c->btree_key_cache.nr_dirty); ++ } ++ ++ bkey_cached_evict(&c->btree_key_cache, ck); ++ ++ bkey_cached_free_fast(&c->btree_key_cache, ck); ++ } ++out: ++ bch2_trans_iter_exit(trans, &b_iter); ++ bch2_trans_iter_exit(trans, &c_iter); ++ return ret; ++} ++ ++int bch2_btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bkey_cached *ck = ++ container_of(pin, struct bkey_cached, journal); ++ struct bkey_cached_key key; ++ int ret = 0; ++ ++ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ ++ six_lock_read(&ck->c.lock, NULL, NULL); ++ key = ck->key; ++ ++ if (ck->journal.seq != seq || ++ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_read(&ck->c.lock); ++ goto unlock; ++ } ++ six_unlock_read(&ck->c.lock); ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ btree_key_cache_flush_pos(&trans, key, seq, ++ BTREE_INSERT_JOURNAL_RECLAIM, false)); ++unlock: ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); ++ ++ return ret; ++} ++ ++/* ++ * Flush and evict a key from the key cache: ++ */ ++int bch2_btree_key_cache_flush(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached_key key = { id, pos }; ++ ++ /* Fastpath - assume it won't be found: */ ++ if (!bch2_btree_key_cache_find(c, id, pos)) ++ return 0; ++ ++ return btree_key_cache_flush_pos(trans, key, 0, 0, true); ++} ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *trans, ++ struct btree_path *path, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ bool kick_reclaim = false; ++ ++ BUG_ON(insert->u64s > ck->u64s); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ int difference; ++ ++ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); ++ ++ difference = jset_u64s(insert->u64s) - ck->res.u64s; ++ if (difference > 0) { ++ trans->journal_preres.u64s -= difference; ++ ck->res.u64s += difference; ++ } ++ } ++ ++ bkey_copy(ck->k, insert); ++ ck->valid = true; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ atomic_long_inc(&c->btree_key_cache.nr_dirty); ++ ++ if (bch2_nr_btree_keys_need_flush(c)) ++ kick_reclaim = true; ++ } ++ ++ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, ++ &ck->journal, bch2_btree_key_cache_journal_flush); ++ ++ if (kick_reclaim) ++ journal_reclaim_kick(&c->journal); ++ return true; ++} ++ ++void bch2_btree_key_cache_drop(struct 
btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ ++ ck->valid = false; ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++} ++ ++static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bucket_table *tbl; ++ struct bkey_cached *ck, *t; ++ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; ++ unsigned start, flags; ++ int srcu_idx; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ flags = memalloc_nofs_save(); ++ ++ /* ++ * Newest freed entries are at the end of the list - once we hit one ++ * that's too new to be freed, we can bail out: ++ */ ++ list_for_each_entry_safe(ck, t, &bc->freed, list) { ++ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ++ ck->btree_trans_barrier_seq)) ++ break; ++ ++ list_del(&ck->list); ++ kmem_cache_free(bch2_key_cache, ck); ++ atomic_long_dec(&bc->nr_freed); ++ scanned++; ++ freed++; ++ } ++ ++ if (scanned >= nr) ++ goto out; ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (bc->shrink_iter >= tbl->size) ++ bc->shrink_iter = 0; ++ start = bc->shrink_iter; ++ ++ do { ++ struct rhash_head *pos, *next; ++ ++ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); ++ ++ while (!rht_is_a_nulls(pos)) { ++ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); ++ ck = container_of(pos, struct bkey_cached, hash); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ goto next; ++ ++ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ else if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(bc, ck); ++ bkey_cached_free(bc, ck); ++ } ++ ++ scanned++; ++ if (scanned >= nr) ++ break; ++next: ++ pos = next; ++ } ++ ++ bc->shrink_iter++; ++ if (bc->shrink_iter >= tbl->size) ++ bc->shrink_iter = 0; ++ } while (scanned < nr && bc->shrink_iter != start); ++ ++ rcu_read_unlock(); ++out: ++ memalloc_nofs_restore(flags); ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); ++ mutex_unlock(&bc->lock); ++ ++ return freed; ++} ++ ++static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ long nr = atomic_long_read(&bc->nr_keys) - ++ atomic_long_read(&bc->nr_dirty); ++ ++ return max(0L, nr); ++} ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) ++{ ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ struct bucket_table *tbl; ++ struct bkey_cached *ck, *n; ++ struct rhash_head *pos; ++ unsigned i; ++ int cpu; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ mutex_lock(&bc->lock); ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed); ++ } ++ rcu_read_unlock(); ++ ++ for_each_possible_cpu(cpu) { ++ struct btree_key_cache_freelist *f = ++ per_cpu_ptr(bc->pcpu_freed, cpu); ++ ++ for (i = 0; i < f->nr; 
i++) { ++ ck = f->objs[i]; ++ list_add(&ck->list, &bc->freed); ++ } ++ } ++ ++ list_for_each_entry_safe(ck, n, &bc->freed, list) { ++ cond_resched(); ++ ++ bch2_journal_pin_drop(&c->journal, &ck->journal); ++ bch2_journal_preres_put(&c->journal, &ck->res); ++ ++ list_del(&ck->list); ++ kfree(ck->k); ++ kmem_cache_free(bch2_key_cache, ck); ++ } ++ ++ BUG_ON(atomic_long_read(&bc->nr_dirty) && ++ !bch2_journal_error(&c->journal) && ++ test_bit(BCH_FS_WAS_RW, &c->flags)); ++ BUG_ON(atomic_long_read(&bc->nr_keys)); ++ ++ mutex_unlock(&bc->lock); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++ ++ free_percpu(bc->pcpu_freed); ++} ++ ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) ++{ ++ mutex_init(&c->lock); ++ INIT_LIST_HEAD(&c->freed); ++} ++ ++static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) ++{ ++ struct btree_key_cache *bc = ++ container_of(shrink, struct btree_key_cache, shrink); ++ ++ bch2_btree_key_cache_to_text(out, bc); ++} ++ ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) ++{ ++ int ret; ++ ++ c->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); ++ if (!c->pcpu_freed) ++ return -ENOMEM; ++ ++ ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++ if (ret) ++ return ret; ++ ++ c->table_init_done = true; ++ ++ c->shrink.seeks = 1; ++ c->shrink.count_objects = bch2_btree_key_cache_count; ++ c->shrink.scan_objects = bch2_btree_key_cache_scan; ++ c->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; ++ return register_shrinker(&c->shrink); ++} ++ ++void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) ++{ ++ prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); ++ prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); ++ prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); ++} ++ ++void bch2_btree_key_cache_exit(void) ++{ ++ if (bch2_key_cache) ++ kmem_cache_destroy(bch2_key_cache); ++} ++ ++int __init bch2_btree_key_cache_init(void) ++{ ++ bch2_key_cache = KMEM_CACHE(bkey_cached, 0); ++ if (!bch2_key_cache) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +new file mode 100644 +index 000000000000..670746e72dab +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.h +@@ -0,0 +1,47 @@ ++#ifndef _BCACHEFS_BTREE_KEY_CACHE_H ++#define _BCACHEFS_BTREE_KEY_CACHE_H ++ ++static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) ++{ ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); ++ size_t max_dirty = 1024 + nr_keys / 2; ++ ++ return max_t(ssize_t, 0, nr_dirty - max_dirty); ++} ++ ++static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) ++{ ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); ++ size_t max_dirty = 4096 + (nr_keys * 3) / 4; ++ ++ return nr_dirty > max_dirty; ++} ++ ++int bch2_btree_key_cache_journal_flush(struct journal *, ++ struct journal_entry_pin *, u64); ++ ++struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); ++ ++int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, ++ unsigned); ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *, ++ struct btree_path *, struct bkey_i *); ++int bch2_btree_key_cache_flush(struct btree_trans *, ++ enum btree_id, 
struct bpos); ++void bch2_btree_key_cache_drop(struct btree_trans *, ++ struct btree_path *); ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *); ++ ++void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); ++ ++void bch2_btree_key_cache_exit(void); ++int __init bch2_btree_key_cache_init(void); ++ ++#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +new file mode 100644 +index 000000000000..49eef650e436 +--- /dev/null ++++ b/fs/bcachefs/btree_locking.h +@@ -0,0 +1,289 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_LOCKING_H ++#define _BCACHEFS_BTREE_LOCKING_H ++ ++/* ++ * Only for internal btree use: ++ * ++ * The btree iterator tracks what locks it wants to take, and what locks it ++ * currently has - here we have wrappers for locking/unlocking btree nodes and ++ * updating the iterator state ++ */ ++ ++#include ++ ++#include "btree_iter.h" ++ ++/* matches six lock types */ ++enum btree_node_locked_type { ++ BTREE_NODE_UNLOCKED = -1, ++ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, ++ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, ++}; ++ ++static inline int btree_node_locked_type(struct btree_path *path, ++ unsigned level) ++{ ++ /* ++ * We're relying on the fact that if nodes_intent_locked is set ++ * nodes_locked must be set as well, so that we can compute without ++ * branches: ++ */ ++ return BTREE_NODE_UNLOCKED + ++ ((path->nodes_locked >> level) & 1) + ++ ((path->nodes_intent_locked >> level) & 1); ++} ++ ++static inline bool btree_node_intent_locked(struct btree_path *path, ++ unsigned level) ++{ ++ return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; ++} ++ ++static inline bool btree_node_read_locked(struct btree_path *path, ++ unsigned level) ++{ ++ return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; ++} ++ ++static inline bool btree_node_locked(struct btree_path *path, unsigned level) ++{ ++ return path->nodes_locked & (1 << level); ++} ++ ++static inline void mark_btree_node_unlocked(struct btree_path *path, ++ unsigned level) ++{ ++ path->nodes_locked &= ~(1 << level); ++ path->nodes_intent_locked &= ~(1 << level); ++} ++ ++static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ /* relying on this to avoid a branch */ ++ BUILD_BUG_ON(SIX_LOCK_read != 0); ++ BUILD_BUG_ON(SIX_LOCK_intent != 1); ++ ++ BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); ++ ++ path->nodes_locked |= 1 << level; ++ path->nodes_intent_locked |= type << level; ++} ++ ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ mark_btree_node_locked_noreset(trans, path, level, type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[level].lock_taken_time = ktime_get_ns(); ++#endif ++} ++ ++static inline void mark_btree_node_intent_locked(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level) ++{ ++ mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); ++} ++ ++static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) ++{ ++ return level < path->locks_want ++ ? 
SIX_LOCK_intent ++ : SIX_LOCK_read; ++} ++ ++static inline enum btree_node_locked_type ++btree_lock_want(struct btree_path *path, int level) ++{ ++ if (level < path->level) ++ return BTREE_NODE_UNLOCKED; ++ if (level < path->locks_want) ++ return BTREE_NODE_INTENT_LOCKED; ++ if (level == path->level) ++ return BTREE_NODE_READ_LOCKED; ++ return BTREE_NODE_UNLOCKED; ++} ++ ++static inline void btree_node_unlock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ int lock_type = btree_node_locked_type(path, level); ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (lock_type != BTREE_NODE_UNLOCKED) { ++ six_unlock_type(&path->l[level].b->c.lock, lock_type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ if (trans->lock_name_idx < BCH_LOCK_TIME_NR) { ++ struct bch_fs *c = trans->c; ++ ++ __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx], ++ path->l[level].lock_taken_time, ++ ktime_get_ns()); ++ } ++#endif ++ } ++ mark_btree_node_unlocked(path, level); ++} ++ ++static inline void __bch2_btree_path_unlock(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); ++ ++ while (path->nodes_locked) ++ btree_node_unlock(trans, path, __ffs(path->nodes_locked)); ++} ++ ++static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) ++{ ++ switch (type) { ++ case SIX_LOCK_read: ++ return BCH_TIME_btree_lock_contended_read; ++ case SIX_LOCK_intent: ++ return BCH_TIME_btree_lock_contended_intent; ++ case SIX_LOCK_write: ++ return BCH_TIME_btree_lock_contended_write; ++ default: ++ BUG(); ++ } ++} ++ ++static inline int btree_node_lock_type(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct bch_fs *c = trans->c; ++ u64 start_time; ++ int ret; ++ ++ if (six_trylock_type(&b->c.lock, type)) ++ return 0; ++ ++ start_time = local_clock(); ++ ++ trans->locking_path_idx = path->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = path->btree_id; ++ trans->locking_level = level; ++ trans->locking_lock_type = type; ++ trans->locking = b; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); ++ trans->locking = NULL; ++ ++ if (ret) ++ return ret; ++ ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ return 0; ++} ++ ++/* ++ * Lock a btree node if we already have it locked on one of our linked ++ * iterators: ++ */ ++static inline bool btree_node_lock_increment(struct btree_trans *trans, ++ struct btree *b, unsigned level, ++ enum btree_node_locked_type want) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (path->l[level].b == b && ++ btree_node_locked_type(path, level) >= want) { ++ six_lock_increment(&b->c.lock, want); ++ return true; ++ } ++ ++ return false; ++} ++ ++int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, ++ struct btree *, struct bpos, unsigned, ++ enum six_lock_type, ++ six_lock_should_sleep_fn, void *, ++ unsigned long); ++ ++static inline int btree_node_lock(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ int ret = 0; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); ++ ++ if (likely(six_trylock_type(&b->c.lock, type)) || 
++ btree_node_lock_increment(trans, b, level, type) || ++ !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, ++ should_sleep_fn, p, ip))) { ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[b->c.level].lock_taken_time = ktime_get_ns(); ++#endif ++ } ++ ++ return ret; ++} ++ ++bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); ++ ++static inline bool bch2_btree_node_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ EBUG_ON(btree_node_locked(path, level) && ++ btree_node_locked_type(path, level) != ++ __btree_lock_want(path, level)); ++ ++ return likely(btree_node_locked(path, level)) || ++ __bch2_btree_node_relock(trans, path, level); ++} ++ ++/* ++ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will ++ * succeed: ++ */ ++static inline void ++bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, ++ struct btree *b) ++{ ++ struct btree_path *linked; ++ ++ EBUG_ON(path->l[b->c.level].b != b); ++ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); ++ ++ trans_for_each_path_with_node(trans, b, linked) ++ linked->l[b->c.level].lock_seq += 2; ++ ++ six_unlock_write(&b->c.lock); ++} ++ ++void bch2_btree_node_unlock_write(struct btree_trans *, ++ struct btree_path *, struct btree *); ++ ++void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); ++ ++static inline void bch2_btree_node_lock_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ EBUG_ON(path->l[b->c.level].b != b); ++ EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ EBUG_ON(!btree_node_intent_locked(path, b->c.level)); ++ ++ if (unlikely(!six_trylock_write(&b->c.lock))) ++ __bch2_btree_node_lock_write(trans, b); ++} ++ ++#endif /* _BCACHEFS_BTREE_LOCKING_H */ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +new file mode 100644 +index 000000000000..a2826dfe13cb +--- /dev/null ++++ b/fs/bcachefs/btree_types.h +@@ -0,0 +1,697 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_TYPES_H ++#define _BCACHEFS_BTREE_TYPES_H ++ ++#include ++#include ++#include ++ ++#include "bkey_methods.h" ++#include "buckets_types.h" ++#include "darray.h" ++#include "journal_types.h" ++ ++struct open_bucket; ++struct btree_update; ++struct btree_trans; ++ ++#define MAX_BSETS 3U ++ ++struct btree_nr_keys { ++ ++ /* ++ * Amount of live metadata (i.e. 
size of node after a compaction) in ++ * units of u64s ++ */ ++ u16 live_u64s; ++ u16 bset_u64s[MAX_BSETS]; ++ ++ /* live keys only: */ ++ u16 packed_keys; ++ u16 unpacked_keys; ++}; ++ ++struct bset_tree { ++ /* ++ * We construct a binary tree in an array as if the array ++ * started at 1, so that things line up on the same cachelines ++ * better: see comments in bset.c at cacheline_to_bkey() for ++ * details ++ */ ++ ++ /* size of the binary tree and prev array */ ++ u16 size; ++ ++ /* function of size - precalculated for to_inorder() */ ++ u16 extra; ++ ++ u16 data_offset; ++ u16 aux_data_offset; ++ u16 end_offset; ++}; ++ ++struct btree_write { ++ struct journal_entry_pin journal; ++}; ++ ++struct btree_alloc { ++ struct open_buckets ob; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); ++}; ++ ++struct btree_bkey_cached_common { ++ struct six_lock lock; ++ u8 level; ++ u8 btree_id; ++}; ++ ++struct btree { ++ struct btree_bkey_cached_common c; ++ ++ struct rhash_head hash; ++ u64 hash_val; ++ ++ unsigned long flags; ++ u16 written; ++ u8 nsets; ++ u8 nr_key_bits; ++ u16 version_ondisk; ++ ++ struct bkey_format format; ++ ++ struct btree_node *data; ++ void *aux_data; ++ ++ /* ++ * Sets of sorted keys - the real btree node - plus a binary search tree ++ * ++ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point ++ * to the memory we have allocated for this btree node. Additionally, ++ * set[0]->data points to the entire btree node as it exists on disk. ++ */ ++ struct bset_tree set[MAX_BSETS]; ++ ++ struct btree_nr_keys nr; ++ u16 sib_u64s[2]; ++ u16 whiteout_u64s; ++ u8 byte_order; ++ u8 unpack_fn_len; ++ ++ struct btree_write writes[2]; ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ ++ /* ++ * XXX: add a delete sequence number, so when bch2_btree_node_relock() ++ * fails because the lock sequence number has changed - i.e. the ++ * contents were modified - we can still relock the node if it's still ++ * the one we want, without redoing the traversal ++ */ ++ ++ /* ++ * For asynchronous splits/interior node updates: ++ * When we do a split, we allocate new child nodes and update the parent ++ * node to point to them: we update the parent in memory immediately, ++ * but then we must wait until the children have been written out before ++ * the update to the parent can be written - this is a list of the ++ * btree_updates that are blocking this node from being ++ * written: ++ */ ++ struct list_head write_blocked; ++ ++ /* ++ * Also for asynchronous splits/interior node updates: ++ * If a btree node isn't reachable yet, we don't want to kick off ++ * another write - because that write also won't yet be reachable and ++ * marking it as completed before it's reachable would be incorrect: ++ */ ++ unsigned long will_make_reachable; ++ ++ struct open_buckets ob; ++ ++ /* lru list */ ++ struct list_head list; ++}; ++ ++struct btree_cache { ++ struct rhashtable table; ++ bool table_init_done; ++ /* ++ * We never free a struct btree, except on shutdown - we just put it on ++ * the btree_cache_freed list and reuse it later. This simplifies the ++ * code, and it doesn't cost us much memory as the memory usage is ++ * dominated by buffers that hold the actual btree node data and those ++ * can be freed - and the number of struct btrees allocated is ++ * effectively bounded. 
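/*
 * Editorial sketch, not part of the patch: the bset_tree comment above
 * describes building a binary tree in an array "as if the array started at
 * 1" for better cacheline locality (see cacheline_to_bkey() in bset.c).
 * This standalone program only demonstrates that 1-based implicit-tree
 * indexing idea with plain ints; eytzinger_build()/eytzinger_search(), N and
 * the layout details are illustrative and differ from the real aux tree,
 * which stores packed-key offsets grouped per cacheline.
 */
#include <stdio.h>

#define N 15

static void eytzinger_build(const int *sorted, int *tree, int n, int *src, int i)
{
	if (i > n)
		return;
	eytzinger_build(sorted, tree, n, src, 2 * i);     /* left subtree  */
	tree[i] = sorted[(*src)++];                       /* in-order fill */
	eytzinger_build(sorted, tree, n, src, 2 * i + 1); /* right subtree */
}

static int eytzinger_search(const int *tree, int n, int key)
{
	int i = 1;

	/* children of node i live at 2i and 2i + 1, so nodes near the root
	 * cluster at the front of the array and share cachelines: */
	while (i <= n)
		i = 2 * i + (tree[i] < key);

	/* shifting off the trailing 1-bits plus one 0-bit recovers the last
	 * ancestor where the search went left, i.e. the lower bound: */
	i >>= __builtin_ffs(~i);

	return i && tree[i] == key ? i : 0;
}

int main(void)
{
	int sorted[N], tree[N + 1], src = 0, i;

	for (i = 0; i < N; i++)
		sorted[i] = 10 * (i + 1);

	eytzinger_build(sorted, tree, N, &src, 1);
	printf("70 found at tree index %d\n", eytzinger_search(tree, N, 70));
	return 0;
}
/* end of editorial sketch */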
++ * ++ * btree_cache_freeable effectively is a small cache - we use it because ++ * high order page allocations can be rather expensive, and it's quite ++ * common to delete and allocate btree nodes in quick succession. It ++ * should never grow past ~2-3 nodes in practice. ++ */ ++ struct mutex lock; ++ struct list_head live; ++ struct list_head freeable; ++ struct list_head freed_pcpu; ++ struct list_head freed_nonpcpu; ++ ++ /* Number of elements in live + freeable lists */ ++ unsigned used; ++ unsigned reserve; ++ atomic_t dirty; ++ struct shrinker shrink; ++ ++ /* ++ * If we need to allocate memory for a new btree node and that ++ * allocation fails, we can cannibalize another node in the btree cache ++ * to satisfy the allocation - lock to guarantee only one thread does ++ * this at a time: ++ */ ++ struct task_struct *alloc_lock; ++ struct closure_waitlist alloc_wait; ++}; ++ ++struct btree_node_iter { ++ struct btree_node_iter_set { ++ u16 k, end; ++ } data[MAX_BSETS]; ++}; ++ ++/* ++ * Iterate over all possible positions, synthesizing deleted keys for holes: ++ */ ++#define BTREE_ITER_SLOTS (1 << 0) ++#define BTREE_ITER_ALL_LEVELS (1 << 1) ++/* ++ * Indicates that intent locks should be taken on leaf nodes, because we expect ++ * to be doing updates: ++ */ ++#define BTREE_ITER_INTENT (1 << 2) ++/* ++ * Causes the btree iterator code to prefetch additional btree nodes from disk: ++ */ ++#define BTREE_ITER_PREFETCH (1 << 3) ++/* ++ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for ++ * @pos or the first key strictly greater than @pos ++ */ ++#define BTREE_ITER_IS_EXTENTS (1 << 4) ++#define BTREE_ITER_NOT_EXTENTS (1 << 5) ++#define BTREE_ITER_CACHED (1 << 6) ++#define BTREE_ITER_CACHED_NOFILL (1 << 7) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 8) ++#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) ++#define BTREE_ITER_WITH_UPDATES (1 << 10) ++#define BTREE_ITER_WITH_JOURNAL (1 << 11) ++#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) ++#define BTREE_ITER_NOPRESERVE (1 << 15) ++ ++enum btree_path_uptodate { ++ BTREE_ITER_UPTODATE = 0, ++ BTREE_ITER_NEED_RELOCK = 1, ++ BTREE_ITER_NEED_TRAVERSE = 2, ++}; ++ ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) ++ ++struct btree_path { ++ u8 idx; ++ u8 sorted_idx; ++ u8 ref; ++ u8 intent_ref; ++ ++ /* btree_iter_copy starts here: */ ++ struct bpos pos; ++ ++ enum btree_id btree_id:4; ++ bool cached:1; ++ bool preserve:1; ++ enum btree_path_uptodate uptodate:2; ++ /* ++ * When true, failing to relock this path will cause the transaction to ++ * restart: ++ */ ++ bool should_be_locked:1; ++ unsigned level:3, ++ locks_want:4, ++ nodes_locked:4, ++ nodes_intent_locked:4; ++ ++ struct btree_path_level { ++ struct btree *b; ++ struct btree_node_iter iter; ++ u32 lock_seq; ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ u64 lock_taken_time; ++#endif ++ } l[BTREE_MAX_DEPTH]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif ++}; ++ ++static inline struct btree_path_level *path_l(struct btree_path *path) ++{ ++ return path->l + 
path->level; ++} ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ struct btree_trans *trans; ++ struct btree_path *path; ++ struct btree_path *update_path; ++ struct btree_path *key_cache_path; ++ ++ enum btree_id btree_id:4; ++ unsigned min_depth:3; ++ unsigned advanced:1; ++ ++ /* btree_iter_copy starts here: */ ++ u16 flags; ++ ++ /* When we're filtering by snapshot, the snapshot ID we're looking for: */ ++ unsigned snapshot; ++ ++ struct bpos pos; ++ struct bpos pos_after_commit; ++ /* ++ * Current unpacked key - so that bch2_btree_iter_next()/ ++ * bch2_btree_iter_next_slot() can correctly advance pos. ++ */ ++ struct bkey k; ++ ++ /* BTREE_ITER_WITH_JOURNAL: */ ++ size_t journal_idx; ++ struct bpos journal_pos; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif ++}; ++ ++struct btree_key_cache_freelist { ++ struct bkey_cached *objs[16]; ++ unsigned nr; ++}; ++ ++struct btree_key_cache { ++ struct mutex lock; ++ struct rhashtable table; ++ bool table_init_done; ++ struct list_head freed; ++ struct shrinker shrink; ++ unsigned shrink_iter; ++ struct btree_key_cache_freelist __percpu *pcpu_freed; ++ ++ atomic_long_t nr_freed; ++ atomic_long_t nr_keys; ++ atomic_long_t nr_dirty; ++}; ++ ++struct bkey_cached_key { ++ u32 btree_id; ++ struct bpos pos; ++} __attribute__((packed, aligned(4))); ++ ++#define BKEY_CACHED_ACCESSED 0 ++#define BKEY_CACHED_DIRTY 1 ++ ++struct bkey_cached { ++ struct btree_bkey_cached_common c; ++ ++ unsigned long flags; ++ u16 u64s; ++ bool valid; ++ u32 btree_trans_barrier_seq; ++ struct bkey_cached_key key; ++ ++ struct rhash_head hash; ++ struct list_head list; ++ ++ struct journal_preres res; ++ struct journal_entry_pin journal; ++ ++ struct bkey_i *k; ++}; ++ ++struct btree_insert_entry { ++ unsigned flags; ++ u8 bkey_type; ++ enum btree_id btree_id:8; ++ u8 level:4; ++ bool cached:1; ++ bool insert_trigger_run:1; ++ bool overwrite_trigger_run:1; ++ bool key_cache_already_flushed:1; ++ /* ++ * @old_k may be a key from the journal; @old_btree_u64s always refers ++ * to the size of the key being overwritten in the btree: ++ */ ++ u8 old_btree_u64s; ++ struct bkey_i *k; ++ struct btree_path *path; ++ /* key being overwritten: */ ++ struct bkey old_k; ++ const struct bch_val *old_v; ++ unsigned long ip_allocated; ++}; ++ ++#ifndef CONFIG_LOCKDEP ++#define BTREE_ITER_MAX 64 ++#else ++#define BTREE_ITER_MAX 32 ++#endif ++ ++struct btree_trans_commit_hook; ++typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); ++ ++struct btree_trans_commit_hook { ++ btree_trans_commit_hook_fn *fn; ++ struct btree_trans_commit_hook *next; ++}; ++ ++#define BTREE_TRANS_MEM_MAX (1U << 16) ++ ++#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 ++ ++struct btree_trans { ++ struct bch_fs *c; ++ const char *fn; ++ struct list_head list; ++ u64 last_begin_time; ++ struct btree *locking; ++ unsigned locking_path_idx; ++ struct bpos locking_pos; ++ u8 locking_btree_id; ++ u8 locking_level; ++ u8 locking_lock_type; ++ struct task_struct *task; ++ int srcu_idx; ++ ++ u8 nr_sorted; ++ u8 nr_updates; ++ u8 traverse_all_idx; ++ bool used_mempool:1; ++ bool in_traverse_all:1; ++ bool memory_allocation_failure:1; ++ bool is_initial_gc:1; ++ enum bch_errcode 
restarted:16; ++ u32 restart_count; ++ unsigned long last_restarted_ip; ++ ++ /* ++ * For when bch2_trans_update notices we'll be splitting a compressed ++ * extent: ++ */ ++ unsigned extra_journal_res; ++ ++ u64 paths_allocated; ++ ++ unsigned mem_top; ++ unsigned mem_bytes; ++ void *mem; ++ ++ u8 sorted[BTREE_ITER_MAX]; ++ struct btree_path *paths; ++ struct btree_insert_entry *updates; ++ ++ /* update path: */ ++ struct btree_trans_commit_hook *hooks; ++ DARRAY(u64) extra_journal_entries; ++ struct journal_entry_pin *journal_pin; ++ ++ struct journal_res journal_res; ++ struct journal_preres journal_preres; ++ u64 *journal_seq; ++ struct disk_reservation *disk_res; ++ unsigned flags; ++ unsigned journal_u64s; ++ unsigned journal_preres_u64s; ++ struct replicas_delta_list *fs_usage_deltas; ++ int lock_name_idx; ++}; ++ ++#define BTREE_FLAGS() \ ++ x(read_in_flight) \ ++ x(read_error) \ ++ x(dirty) \ ++ x(need_write) \ ++ x(write_blocked) \ ++ x(will_make_reachable) \ ++ x(noevict) \ ++ x(write_idx) \ ++ x(accessed) \ ++ x(write_in_flight) \ ++ x(write_in_flight_inner) \ ++ x(just_written) \ ++ x(dying) \ ++ x(fake) \ ++ x(need_rewrite) \ ++ x(never_write) ++ ++enum btree_flags { ++#define x(flag) BTREE_NODE_##flag, ++ BTREE_FLAGS() ++#undef x ++}; ++ ++#define x(flag) \ ++static inline bool btree_node_ ## flag(struct btree *b) \ ++{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void set_btree_node_ ## flag(struct btree *b) \ ++{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void clear_btree_node_ ## flag(struct btree *b) \ ++{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } ++ ++BTREE_FLAGS() ++#undef x ++ ++static inline struct btree_write *btree_current_write(struct btree *b) ++{ ++ return b->writes + btree_node_write_idx(b); ++} ++ ++static inline struct btree_write *btree_prev_write(struct btree *b) ++{ ++ return b->writes + (btree_node_write_idx(b) ^ 1); ++} ++ ++static inline struct bset_tree *bset_tree_last(struct btree *b) ++{ ++ EBUG_ON(!b->nsets); ++ return b->set + b->nsets - 1; ++} ++ ++static inline void * ++__btree_node_offset_to_ptr(const struct btree *b, u16 offset) ++{ ++ return (void *) ((u64 *) b->data + 1 + offset); ++} ++ ++static inline u16 ++__btree_node_ptr_to_offset(const struct btree *b, const void *p) ++{ ++ u16 ret = (u64 *) p - 1 - (u64 *) b->data; ++ ++ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); ++ return ret; ++} ++ ++static inline struct bset *bset(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return __btree_node_offset_to_ptr(b, t->data_offset); ++} ++ ++static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) ++{ ++ t->end_offset = ++ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); ++} ++ ++static inline void set_btree_bset(struct btree *b, struct bset_tree *t, ++ const struct bset *i) ++{ ++ t->data_offset = __btree_node_ptr_to_offset(b, i); ++ set_btree_bset_end(b, t); ++} ++ ++static inline struct bset *btree_bset_first(struct btree *b) ++{ ++ return bset(b, b->set); ++} ++ ++static inline struct bset *btree_bset_last(struct btree *b) ++{ ++ return bset(b, bset_tree_last(b)); ++} ++ ++static inline u16 ++__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) ++{ ++ return __btree_node_ptr_to_offset(b, k); ++} ++ ++static inline struct bkey_packed * ++__btree_node_offset_to_key(const struct btree *b, u16 k) ++{ ++ return __btree_node_offset_to_ptr(b, k); ++} ++ ++static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) 
++{ ++ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); ++} ++ ++#define btree_bkey_first(_b, _t) \ ++({ \ ++ EBUG_ON(bset(_b, _t)->start != \ ++ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ ++ \ ++ bset(_b, _t)->start; \ ++}) ++ ++#define btree_bkey_last(_b, _t) \ ++({ \ ++ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ ++ vstruct_last(bset(_b, _t))); \ ++ \ ++ __btree_node_offset_to_key(_b, (_t)->end_offset); \ ++}) ++ ++static inline unsigned bset_u64s(struct bset_tree *t) ++{ ++ return t->end_offset - t->data_offset - ++ sizeof(struct bset) / sizeof(u64); ++} ++ ++static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) ++{ ++ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; ++} ++ ++static inline unsigned bset_byte_offset(struct btree *b, void *i) ++{ ++ return i - (void *) b->data; ++} ++ ++enum btree_node_type { ++#define x(kwd, val) BKEY_TYPE_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BKEY_TYPE_btree, ++}; ++ ++/* Type of a key in btree @id at level @level: */ ++static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) ++{ ++ return level ? BKEY_TYPE_btree : (enum btree_node_type) id; ++} ++ ++/* Type of keys @b contains: */ ++static inline enum btree_node_type btree_node_type(struct btree *b) ++{ ++ return __btree_node_type(b->c.level, b->c.btree_id); ++} ++ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ++ ((1U << BKEY_TYPE_extents)| \ ++ (1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_inodes)| \ ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_reflink)| \ ++ (1U << BKEY_TYPE_btree)) ++ ++#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ++ ((1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_inodes)| \ ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_snapshots)) ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ ++ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) ++ ++#define BTREE_ID_IS_EXTENTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)| \ ++ (1U << BTREE_ID_freespace)) ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ return (1U << type) & BTREE_ID_IS_EXTENTS; ++} ++ ++#define BTREE_ID_HAS_SNAPSHOTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_inodes)| \ ++ (1U << BTREE_ID_dirents)| \ ++ (1U << BTREE_ID_xattrs)) ++ ++#define BTREE_ID_HAS_PTRS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)) ++ ++static inline bool btree_type_has_snapshots(enum btree_id id) ++{ ++ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; ++} ++ ++static inline bool btree_type_has_ptrs(enum btree_id id) ++{ ++ return (1 << id) & BTREE_ID_HAS_PTRS; ++} ++ ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} ++ ++struct btree_root { ++ struct btree *b; ++ ++ /* On disk root - see async splits: */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u8 level; ++ u8 alive; ++ s8 error; ++}; ++ ++enum btree_insert_ret { ++ BTREE_INSERT_OK, ++ /* leaf node needs to be split */ ++ BTREE_INSERT_BTREE_NODE_FULL, ++ BTREE_INSERT_NEED_MARK_REPLICAS, ++ BTREE_INSERT_NEED_JOURNAL_RES, ++ BTREE_INSERT_NEED_JOURNAL_RECLAIM, ++}; ++ ++enum btree_gc_coalesce_fail_reason { ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, ++}; ++ ++enum btree_node_sibling { ++ btree_prev_sib, ++ btree_next_sib, ++}; ++ ++#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.h 
b/fs/bcachefs/btree_update.h +new file mode 100644 +index 000000000000..89941fb8caa0 +--- /dev/null ++++ b/fs/bcachefs/btree_update.h +@@ -0,0 +1,158 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_H ++#define _BCACHEFS_BTREE_UPDATE_H ++ ++#include "btree_iter.h" ++#include "journal.h" ++ ++struct bch_fs; ++struct btree; ++ ++void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, ++ struct btree *); ++bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, ++ struct btree *, struct btree_node_iter *, ++ struct bkey_i *); ++void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); ++ ++enum btree_insert_flags { ++ /* First two bits for journal watermark: */ ++ __BTREE_INSERT_NOFAIL = 2, ++ __BTREE_INSERT_NOCHECK_RW, ++ __BTREE_INSERT_LAZY_RW, ++ __BTREE_INSERT_USE_RESERVE, ++ __BTREE_INSERT_JOURNAL_REPLAY, ++ __BTREE_INSERT_JOURNAL_RECLAIM, ++ __BTREE_INSERT_NOWAIT, ++ __BTREE_INSERT_GC_LOCK_HELD, ++ __BCH_HASH_SET_MUST_CREATE, ++ __BCH_HASH_SET_MUST_REPLACE, ++}; ++ ++/* Don't check for -ENOSPC: */ ++#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) ++ ++#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) ++#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) ++ ++/* for copygc, or when merging btree nodes */ ++#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) ++ ++/* Insert is for journal replay - don't get journal reservations: */ ++#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) ++ ++/* Insert is being called from journal reclaim path: */ ++#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) ++ ++/* Don't block on allocation failure (for new btree nodes: */ ++#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) ++#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) ++ ++#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) ++#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) ++ ++int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, ++ unsigned, unsigned); ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); ++ ++int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, u64 *, int flags); ++ ++int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, ++ struct bpos, struct bpos, unsigned, u64 *); ++int bch2_btree_delete_range(struct bch_fs *, enum btree_id, ++ struct bpos, struct bpos, unsigned, u64 *); ++ ++int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, ++ struct btree *, unsigned); ++void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); ++int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, ++ struct btree *, struct bkey_i *, bool); ++int bch2_btree_node_update_key_get_iter(struct btree_trans *, ++ struct btree *, struct bkey_i *, bool); ++ ++int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++void bch2_trans_commit_hook(struct btree_trans *, ++ struct btree_trans_commit_hook *); ++int __bch2_trans_commit(struct btree_trans *); ++ ++int bch2_trans_log_msg(struct btree_trans *, const char *); ++ ++/** ++ * bch2_trans_commit 
- insert keys at given iterator positions ++ * ++ * This is main entry point for btree updates. ++ * ++ * Return values: ++ * -EROFS: filesystem read only ++ * -EIO: journal or btree node IO error ++ */ ++static inline int bch2_trans_commit(struct btree_trans *trans, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ unsigned flags) ++{ ++ trans->disk_res = disk_res; ++ trans->journal_seq = journal_seq; ++ trans->flags = flags; ++ ++ return __bch2_trans_commit(trans); ++} ++ ++#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ ++#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ ++#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ ++ bch2_trans_exit(&trans); \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_run(_c, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = (_do); \ ++ bch2_trans_exit(&trans); \ ++ \ ++ _ret; \ ++}) ++ ++#define trans_for_each_update(_trans, _i) \ ++ for ((_i) = (_trans)->updates; \ ++ (_i) < (_trans)->updates + (_trans)->nr_updates; \ ++ (_i)++) ++ ++static inline void bch2_trans_reset_updates(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_path_put(trans, i->path, true); ++ ++ trans->extra_journal_res = 0; ++ trans->nr_updates = 0; ++ trans->hooks = NULL; ++ trans->extra_journal_entries.nr = 0; ++} ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +new file mode 100644 +index 000000000000..5525635ec04a +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.c +@@ -0,0 +1,2266 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, ++ struct btree_path *, struct btree *, ++ struct keylist *, unsigned); ++static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); ++ ++/* Debug code: */ ++ ++/* ++ * Verify that child nodes correctly span parent node's range: ++ */ ++static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos next_node = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_btree_ptr_v2 bp; ++ struct bkey unpacked; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ BUG_ON(!b->c.level); ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while (1) { ++ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ if 
(k.k->type != KEY_TYPE_btree_ptr_v2) ++ break; ++ bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (bpos_cmp(next_node, bp.v->min_key)) { ++ bch2_dump_btree_node(c, b); ++ bch2_bpos_to_text(&buf1, next_node); ++ bch2_bpos_to_text(&buf2, bp.v->min_key); ++ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); ++ } ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (bch2_btree_node_iter_end(&iter)) { ++ if (bpos_cmp(k.k->p, b->key.k.p)) { ++ bch2_dump_btree_node(c, b); ++ bch2_bpos_to_text(&buf1, b->key.k.p); ++ bch2_bpos_to_text(&buf2, k.k->p); ++ panic("expected end %s got %s\n", buf1.buf, buf2.buf); ++ } ++ break; ++ } ++ ++ next_node = bpos_successor(k.k->p); ++ } ++#endif ++} ++ ++/* Calculate ideal packed bkey format for new btree nodes: */ ++ ++void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) ++{ ++ struct bkey_packed *k; ++ struct bset_tree *t; ++ struct bkey uk; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_deleted(k)) { ++ uk = bkey_unpack_key(b, k); ++ bch2_bkey_format_add_key(s, &uk); ++ } ++} ++ ++static struct bkey_format bch2_btree_calc_format(struct btree *b) ++{ ++ struct bkey_format_state s; ++ ++ bch2_bkey_format_init(&s); ++ bch2_bkey_format_add_pos(&s, b->data->min_key); ++ bch2_bkey_format_add_pos(&s, b->data->max_key); ++ __bch2_btree_calc_format(&s, b); ++ ++ return bch2_bkey_format_done(&s); ++} ++ ++static size_t btree_node_u64s_with_format(struct btree *b, ++ struct bkey_format *new_f) ++{ ++ struct bkey_format *old_f = &b->format; ++ ++ /* stupid integer promotion rules */ ++ ssize_t delta = ++ (((int) new_f->key_u64s - old_f->key_u64s) * ++ (int) b->nr.packed_keys) + ++ (((int) new_f->key_u64s - BKEY_U64s) * ++ (int) b->nr.unpacked_keys); ++ ++ BUG_ON(delta + b->nr.live_u64s < 0); ++ ++ return b->nr.live_u64s + delta; ++} ++ ++/** ++ * btree_node_format_fits - check if we could rewrite node with a new format ++ * ++ * This assumes all keys can pack with the new format -- it just checks if ++ * the re-packed keys would fit inside the node itself. 
++ */ ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, ++ struct bkey_format *new_f) ++{ ++ size_t u64s = btree_node_u64s_with_format(b, new_f); ++ ++ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); ++} ++ ++/* Btree node freeing/allocation: */ ++ ++static void __btree_node_free(struct bch_fs *c, struct btree *b) ++{ ++ trace_btree_node_free(c, b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_need_write(b)); ++ BUG_ON(b == btree_node_root(c, b)); ++ BUG_ON(b->ob.nr); ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable); ++ ++ clear_btree_node_noevict(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++} ++ ++static void bch2_btree_node_free_inmem(struct btree_trans *trans, ++ struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ BUG_ON(path->l[b->c.level].b == b && ++ path->l[b->c.level].lock_seq == b->c.lock.state.seq); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ __btree_node_free(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ++ struct disk_reservation *res, ++ struct closure *cl, ++ bool interior_node, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct write_point *wp; ++ struct btree *b; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct open_buckets ob = { .nr = 0 }; ++ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; ++ unsigned nr_reserve; ++ enum alloc_reserve alloc_reserve; ++ ++ if (flags & BTREE_INSERT_USE_RESERVE) { ++ nr_reserve = 0; ++ alloc_reserve = RESERVE_btree_movinggc; ++ } else { ++ nr_reserve = BTREE_NODE_RESERVE; ++ alloc_reserve = RESERVE_btree; ++ } ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ if (c->btree_reserve_cache_nr > nr_reserve) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ ob = a->ob; ++ bkey_copy(&tmp.k, &a->k); ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto mem_alloc; ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++retry: ++ wp = bch2_alloc_sectors_start_trans(trans, ++ c->opts.metadata_target ?: ++ c->opts.foreground_target, ++ 0, ++ writepoint_ptr(&c->btree_write_point), ++ &devs_have, ++ res->nr_replicas, ++ c->opts.metadata_replicas_required, ++ alloc_reserve, 0, cl); ++ if (IS_ERR(wp)) ++ return ERR_CAST(wp); ++ ++ if (wp->sectors_free < btree_sectors(c)) { ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ob->sectors_free < btree_sectors(c)) ++ ob->sectors_free = 0; ++ ++ bch2_alloc_sectors_done(c, wp); ++ goto retry; ++ } ++ ++ bkey_btree_ptr_v2_init(&tmp.k); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); ++ ++ bch2_open_bucket_get(c, wp, &ob); ++ bch2_alloc_sectors_done(c, wp); ++mem_alloc: ++ b = bch2_btree_node_mem_alloc(c, interior_node); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ /* we hold cannibalize_lock: */ ++ BUG_ON(IS_ERR(b)); ++ BUG_ON(b->ob.nr); ++ ++ bkey_copy(&b->key, &tmp.k); ++ b->ob = ob; ++ ++ return b; ++} ++ ++static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; ++ int ret; ++ ++ 
BUG_ON(level >= BTREE_MAX_DEPTH); ++ BUG_ON(!p->nr); ++ ++ b = p->b[--p->nr]; ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ set_btree_node_accessed(b); ++ set_btree_node_dirty_acct(c, b); ++ set_btree_node_need_write(b); ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ b->c.level = level; ++ b->c.btree_id = as->btree_id; ++ b->version_ondisk = c->sb.version; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ b->data->magic = cpu_to_le64(bset_magic(c)); ++ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); ++ b->data->flags = 0; ++ SET_BTREE_NODE_ID(b->data, as->btree_id); ++ SET_BTREE_NODE_LEVEL(b->data, level); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); ++ ++ bp->v.mem_ptr = 0; ++ bp->v.seq = b->data->keys.seq; ++ bp->v.sectors_written = 0; ++ } ++ ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); ++ BUG_ON(ret); ++ ++ trace_btree_node_alloc(c, b); ++ return b; ++} ++ ++static void btree_set_min(struct btree *b, struct bpos pos) ++{ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; ++ b->data->min_key = pos; ++} ++ ++static void btree_set_max(struct btree *b, struct bpos pos) ++{ ++ b->key.k.p = pos; ++ b->data->max_key = pos; ++} ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b, ++ struct bkey_format format) ++{ ++ struct btree *n; ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); ++ ++ btree_set_min(n, b->data->min_key); ++ btree_set_max(n, b->data->max_key); ++ ++ n->data->format = format; ++ btree_node_set_format(n, format); ++ ++ bch2_btree_sort_into(as->c, n, b); ++ ++ btree_node_reset_sib_u64s(n); ++ ++ n->key.k.p = b->key.k.p; ++ return n; ++} ++ ++static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bkey_format new_f = bch2_btree_calc_format(b); ++ ++ /* ++ * The keys might expand with the new format - if they wouldn't fit in ++ * the btree node anymore, use the old format for now: ++ */ ++ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) ++ new_f = b->format; ++ ++ return __bch2_btree_node_alloc_replacement(as, b, new_f); ++} ++ ++static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) ++{ ++ struct btree *b = bch2_btree_node_alloc(as, level); ++ ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, SPOS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ ++ btree_node_set_format(b, b->data->format); ++ bch2_btree_build_aux_trees(b); ++ ++ bch2_btree_update_add_new_node(as, b); ++ six_unlock_write(&b->c.lock); ++ ++ return b; ++} ++ ++static void bch2_btree_reserve_put(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct prealloc_nodes *p; ++ ++ for (p = as->prealloc_nodes; ++ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); ++ p++) { ++ while (p->nr) { ++ struct btree *b = p->b[--p->nr]; ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ 
++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ } ++ } ++} ++ ++static int bch2_btree_reserve_get(struct btree_trans *trans, ++ struct btree_update *as, ++ unsigned nr_nodes[2], ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ unsigned interior; ++ int ret = 0; ++ ++ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); ++ ++ /* ++ * Protects reaping from the btree node cache and using the btree node ++ * open bucket reserve: ++ * ++ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not ++ * blocking on this lock: ++ */ ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ if (ret) ++ return ret; ++ ++ for (interior = 0; interior < 2; interior++) { ++ struct prealloc_nodes *p = as->prealloc_nodes + interior; ++ ++ while (p->nr < nr_nodes[interior]) { ++ b = __bch2_btree_node_alloc(trans, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ? NULL : cl, ++ interior, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err; ++ } ++ ++ p->b[p->nr++] = b; ++ } ++ } ++err: ++ bch2_btree_cache_cannibalize_unlock(c); ++ return ret; ++} ++ ++/* Asynchronous interior node update machinery */ ++ ++static void bch2_btree_update_free(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ if (as->took_gc_lock) ++ up_read(&c->gc_lock); ++ as->took_gc_lock = false; ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ bch2_journal_pin_flush(&c->journal, &as->journal); ++ bch2_disk_reservation_put(c, &as->disk_res); ++ bch2_btree_reserve_put(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], ++ as->start_time); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_del(&as->unwritten_list); ++ list_del(&as->list); ++ ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ ++ /* ++ * Have to do the wakeup with btree_interior_update_lock still held, ++ * since being on btree_interior_update_list is our ref on @c: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_add_key(struct btree_update *as, ++ struct keylist *keys, struct btree *b) ++{ ++ struct bkey_i *k = &b->key; ++ ++ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > ++ ARRAY_SIZE(as->_old_keys)); ++ ++ bkey_copy(keys->top, k); ++ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; ++ ++ bch2_keylist_push(keys); ++} ++ ++/* ++ * The transactional part of an interior btree node update, where we journal the ++ * update we did to the interior node and update alloc info: ++ */ ++static int btree_update_nodes_written_trans(struct btree_trans *trans, ++ struct btree_update *as) ++{ ++ struct bkey_i *k; ++ int ret; ++ ++ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); ++ if (ret) ++ return ret; ++ ++ memcpy(&darray_top(trans->extra_journal_entries), ++ as->journal_entries, ++ as->journal_u64s * sizeof(u64)); ++ trans->extra_journal_entries.nr += as->journal_u64s; ++ ++ trans->journal_pin = &as->journal; ++ ++ for_each_keylist_key(&as->old_keys, k) { ++ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ++ ++ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_keylist_key(&as->new_keys, k) { ++ unsigned level = 
bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ++ ++ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void btree_update_nodes_written(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b = as->b; ++ struct btree_trans trans; ++ u64 journal_seq = 0; ++ unsigned i; ++ int ret; ++ ++ /* ++ * If we're already in an error state, it might be because a btree node ++ * was never written, and we might be trying to free that same btree ++ * node here, but it won't have been marked as allocated and we'll see ++ * spurious disk usage inconsistencies in the transactional part below ++ * if we don't skip it: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ /* ++ * Wait for any in flight writes to finish before we free the old nodes ++ * on disk: ++ */ ++ for (i = 0; i < as->nr_old_nodes; i++) { ++ struct btree *old = as->old_nodes[i]; ++ __le64 seq; ++ ++ six_lock_read(&old->c.lock, NULL, NULL); ++ seq = old->data ? old->data->keys.seq : 0; ++ six_unlock_read(&old->c.lock); ++ ++ if (seq == as->old_nodes_seq[i]) ++ wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, ++ TASK_UNINTERRUPTIBLE); ++ } ++ ++ /* ++ * We did an update to a parent node where the pointers we added pointed ++ * to child nodes that weren't written yet: now, the child nodes have ++ * been written so we can write out the update to the interior node. ++ */ ++ ++ /* ++ * We can't call into journal reclaim here: we'd block on the journal ++ * reclaim lock, but we may need to release the open buckets we have ++ * pinned in order for other btree updates to make forward progress, and ++ * journal reclaim does btree updates when flushing bkey_cached entries, ++ * which may require allocations as well. 
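/*
 * Editorial sketch, not part of the patch: the commit_do() call just below
 * boils down to "re-run the transaction body until it stops returning a
 * transaction-restart error".  This is a simplified standalone model of
 * that retry loop; MY_ERR_TRANSACTION_RESTART and the function names are
 * made up, and the real lockrestart_do()/commit_do() macros also re-begin
 * the transaction and distinguish the specific BCH_ERR restart codes.
 */
#include <stdio.h>

#define MY_ERR_TRANSACTION_RESTART 1	/* stand-in for a restart errcode */

static int attempts;

static int transaction_body(void)
{
	/* pretend the first two attempts race with another locker: */
	return ++attempts < 3 ? MY_ERR_TRANSACTION_RESTART : 0;
}

static int commit_do_model(int (*body)(void))
{
	int ret;

	do {
		/* a real implementation would reset the transaction here */
		ret = body();
	} while (ret == MY_ERR_TRANSACTION_RESTART);

	return ret;
}

int main(void)
{
	int ret = commit_do_model(transaction_body);

	printf("committed with ret=%d after %d attempts\n", ret, attempts);
	return 0;
}
/* end of editorial sketch */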
++ */ ++ bch2_trans_init(&trans, c, 0, 512); ++ ret = commit_do(&trans, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ JOURNAL_WATERMARK_reserved, ++ btree_update_nodes_written_trans(&trans, as)); ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, ++ "error %i in btree_update_nodes_written()", ret); ++err: ++ if (b) { ++ /* ++ * @b is the node we did the final insert into: ++ * ++ * On failure to get a journal reservation, we still have to ++ * unblock the write and allow most of the write path to happen ++ * so that shutdown works, but the i->journal_seq mechanism ++ * won't work to prevent the btree write from being visible (we ++ * didn't get a journal sequence number) - instead ++ * __bch2_btree_node_write() doesn't do the actual write if ++ * we're in journal error state: ++ */ ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ list_del(&as->write_blocked_list); ++ if (list_empty(&b->write_blocked)) ++ clear_btree_node_write_blocked(b); ++ ++ /* ++ * Node might have been freed, recheck under ++ * btree_interior_update_lock: ++ */ ++ if (as->b == b) { ++ struct bset *i = btree_bset_last(b); ++ ++ BUG_ON(!b->c.level); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ if (!ret) { ++ i->journal_seq = cpu_to_le64( ++ max(journal_seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ } else { ++ /* ++ * If we didn't get a journal sequence number we ++ * can't write this btree node, because recovery ++ * won't know to ignore this write: ++ */ ++ set_btree_node_never_write(b); ++ } ++ } ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ six_unlock_write(&b->c.lock); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ clear_btree_node_will_make_reachable(b); ++ } ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ for (i = 0; i < as->nr_open_buckets; i++) ++ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); ++ ++ bch2_btree_update_free(as); ++} ++ ++static void btree_interior_update_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, btree_interior_update_work); ++ struct btree_update *as; ++ ++ while (1) { ++ mutex_lock(&c->btree_interior_update_lock); ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (!as) ++ break; ++ ++ btree_update_nodes_written(as); ++ } ++} ++ ++static void btree_update_set_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ as->nodes_written = true; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ 
queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); ++} ++ ++/* ++ * We're updating @b with pointers to nodes that haven't finished writing yet: ++ * block @b from being written until @as completes ++ */ ++static void btree_update_updated_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ ++ set_btree_node_write_blocked(b); ++ list_add(&as->write_blocked_list, &b->write_blocked); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_reparent(struct btree_update *as, ++ struct btree_update *child) ++{ ++ struct bch_fs *c = as->c; ++ ++ lockdep_assert_held(&c->btree_interior_update_lock); ++ ++ child->b = NULL; ++ child->mode = BTREE_INTERIOR_UPDATING_AS; ++ ++ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); ++} ++ ++static void btree_update_updated_root(struct btree_update *as, struct btree *b) ++{ ++ struct bkey_i *insert = &b->key; ++ struct bch_fs *c = as->c; ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++/* ++ * bch2_btree_update_add_new_node: ++ * ++ * This causes @as to wait on @b to be written, before it gets to ++ * bch2_btree_update_nodes_written ++ * ++ * Additionally, it sets b->will_make_reachable to prevent any additional writes ++ * to @b from happening besides the first until @b is reachable on disk ++ * ++ * And it adds @b to the list of @as's new nodes, so that we can update sector ++ * counts in bch2_btree_update_nodes_written: ++ */ ++static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ closure_get(&as->cl); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); ++ BUG_ON(b->will_make_reachable); ++ ++ as->new_nodes[as->nr_new_nodes++] = b; ++ b->will_make_reachable = 1UL|(unsigned long) as; ++ set_btree_node_will_make_reachable(b); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ btree_update_add_key(as, &as->new_keys, b); ++} ++ ++/* ++ * returns true if @b was a new node ++ */ ++static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_update *as; ++ unsigned long v; ++ unsigned i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ ++ v = xchg(&b->will_make_reachable, 0); ++ clear_btree_node_will_make_reachable(b); ++ as = (struct btree_update *) (v & ~1UL); ++ ++ if (!as) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } ++ ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if 
(as->new_nodes[i] == b) ++ goto found; ++ ++ BUG(); ++found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (v & 1) ++ closure_put(&as->cl); ++} ++ ++static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) ++{ ++ while (b->ob.nr) ++ as->open_buckets[as->nr_open_buckets++] = ++ b->ob.v[--b->ob.nr]; ++} ++ ++/* ++ * @b is being split/rewritten: it may have pointers to not-yet-written btree ++ * nodes and thus outstanding btree_updates - redirect @b's ++ * btree_updates to point to this btree_update: ++ */ ++static void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree_update *p, *n; ++ struct btree_write *w; ++ ++ set_btree_node_dying(b); ++ ++ if (btree_node_fake(b)) ++ return; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* ++ * Does this node have any btree_update operations preventing ++ * it from being written? ++ * ++ * If so, redirect them to point to this btree_update: we can ++ * write out our new nodes, but we won't make them visible until those ++ * operations complete ++ */ ++ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { ++ list_del_init(&p->write_blocked_list); ++ btree_update_reparent(as, p); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ } ++ ++ clear_btree_node_dirty_acct(c, b); ++ clear_btree_node_need_write(b); ++ ++ /* ++ * Does this node have unwritten data that has a pin on the journal? ++ * ++ * If so, transfer that pin to the btree_update operation - ++ * note that if we're freeing multiple nodes, we only need to keep the ++ * oldest pin of any of the nodes we're freeing. We'll release the pin ++ * when the new nodes are persistent and reachable on disk: ++ */ ++ w = btree_current_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ w = btree_prev_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? 
++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ btree_update_add_key(as, &as->old_keys, b); ++ ++ as->old_nodes[as->nr_old_nodes] = b; ++ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; ++ as->nr_old_nodes++; ++} ++ ++static void bch2_btree_update_done(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ u64 start_time = as->start_time; ++ ++ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); ++ ++ if (as->took_gc_lock) ++ up_read(&as->c->gc_lock); ++ as->took_gc_lock = false; ++ ++ bch2_btree_reserve_put(as); ++ ++ continue_at(&as->cl, btree_update_set_nodes_written, ++ as->c->btree_interior_update_worker); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], ++ start_time); ++} ++ ++static struct btree_update * ++bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ++ unsigned level, bool split, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_update *as; ++ u64 start_time = local_clock(); ++ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ unsigned nr_nodes[2] = { 0, 0 }; ++ unsigned update_level = level; ++ int journal_flags = flags & JOURNAL_WATERMARK_MASK; ++ int ret = 0; ++ u32 restart_count = trans->restart_count; ++ ++ BUG_ON(!path->should_be_locked); ++ ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ journal_flags |= JOURNAL_RES_GET_NONBLOCK; ++ ++ while (1) { ++ nr_nodes[!!update_level] += 1 + split; ++ update_level++; ++ ++ if (!btree_path_node(path, update_level)) ++ break; ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ split = update_level + 1 < BTREE_MAX_DEPTH; ++ } ++ ++ /* Might have to allocate a new root: */ ++ if (update_level < BTREE_MAX_DEPTH) ++ nr_nodes[1] += 1; ++ ++ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ return ERR_PTR(ret); ++ } ++ ++ if (flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ else if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ ret = bch2_trans_relock(trans); ++ if (ret) { ++ up_read(&c->gc_lock); ++ return ERR_PTR(ret); ++ } ++ } ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->start_time = start_time; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); ++ as->btree_id = path->btree_id; ++ INIT_LIST_HEAD(&as->list); ++ INIT_LIST_HEAD(&as->unwritten_list); ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ bch2_keylist_init(&as->old_keys, as->_old_keys); ++ bch2_keylist_init(&as->new_keys, as->_new_keys); ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * We don't want to allocate if we're in an error state, that can cause ++ * deadlock on emergency 
shutdown due to open buckets getting stuck in ++ * the btree_reserve_cache after allocator shutdown has cleared it out. ++ * This check needs to come after adding us to the btree_interior_update ++ * list but before calling bch2_btree_reserve_get, to synchronize with ++ * __bch2_fs_read_only(). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); ++ if (ret) { ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); ++ goto err; ++ } ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_disk_reservation_get(c, &as->disk_res, ++ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), ++ c->opts.metadata_replicas, ++ disk_res_flags); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); ++ if (ret && ret != -EINTR) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ bch2_trans_unlock(trans); ++ ++ do { ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); ++ closure_sync(&cl); ++ } while (ret == -EAGAIN); ++ ++ if (ret) { ++ trace_btree_reserve_get_fail(trans->fn, _RET_IP_, ++ nr_nodes[0] + nr_nodes[1]); ++ goto err; ++ } ++ } ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto err; ++ ++ bch2_trans_verify_not_restarted(trans, restart_count); ++ return as; ++err: ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); ++} ++ ++/* Btree root updates: */ ++ ++static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++{ ++ /* Root nodes cannot be reaped */ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ mutex_lock(&c->btree_root_lock); ++ BUG_ON(btree_node_root(c, b) && ++ (b->c.level < btree_node_root(c, b)->c.level || ++ !btree_node_dying(btree_node_root(c, b)))); ++ ++ btree_node_root(c, b) = b; ++ mutex_unlock(&c->btree_root_lock); ++ ++ bch2_recalc_btree_reserve(c); ++} ++ ++/** ++ * bch_btree_set_root - update the root in memory and on disk ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. However, you must hold an intent lock on the ++ * old root. ++ * ++ * Note: This allocates a journal entry but doesn't add any keys to ++ * it. All the btree roots are part of every journal write, so there ++ * is nothing new to be done. This just guarantees that there is a ++ * journal write. ++ */ ++static void bch2_btree_set_root(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old; ++ ++ trace_btree_set_root(c, b); ++ BUG_ON(!b->written); ++ ++ old = btree_node_root(c, b); ++ ++ /* ++ * Ensure no one is using the old root while we switch to the ++ * new root: ++ */ ++ bch2_btree_node_lock_write(trans, path, old); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ btree_update_updated_root(as, b); ++ ++ /* ++ * Unlock old root after new root is visible: ++ * ++ * The new root isn't persistent, but that's ok: we still have ++ * an intent lock on the new root, and any updates that would ++ * depend on the new root would have to update the new root. 
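/*
 * Editorial sketch, not part of the patch: bch2_btree_set_root() below
 * write-locks the *old* root so nothing can still be traversing it while
 * the in-memory root pointer is switched, and only unlocks it once the new
 * root is visible.  A minimal standalone model of that discipline, using a
 * pthread rwlock in place of the kernel's six locks; all names are made up,
 * and the revalidation a real reader would need if it raced with the swap
 * (bcachefs uses lock sequence numbers for this) is deliberately omitted.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_node {
	int level;
	pthread_rwlock_t lock;
};

static struct toy_node nodes[2] = {
	{ .level = 1, .lock = PTHREAD_RWLOCK_INITIALIZER },
	{ .level = 2, .lock = PTHREAD_RWLOCK_INITIALIZER },
};
static struct toy_node *root = &nodes[0];

static void *reader(void *arg)
{
	struct toy_node *b = root;

	/* readers hold a read lock on whichever root they observed */
	pthread_rwlock_rdlock(&b->lock);
	printf("reader sees root level %d\n", b->level);
	pthread_rwlock_unlock(&b->lock);
	return NULL;
}

static void set_root(struct toy_node *new_root)
{
	struct toy_node *old = root;

	/* blocks until no reader is still inside the old root */
	pthread_rwlock_wrlock(&old->lock);
	root = new_root;		/* new root visible before old is unlocked */
	pthread_rwlock_unlock(&old->lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reader, NULL);
	pthread_join(t, NULL);
	set_root(&nodes[1]);
	printf("root is now level %d\n", root->level);
	return 0;		/* link with -lpthread */
}
/* end of editorial sketch */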
++ */ ++ bch2_btree_node_unlock_write(trans, path, old); ++} ++ ++/* Interior node updates: */ ++ ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = as->c; ++ struct bkey_packed *k; ++ struct printbuf buf = PRINTBUF; ++ ++ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && ++ !btree_ptr_sectors_written(insert)); ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ btree_node_type(b), WRITE, &buf) ?: ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "inserting invalid bkey\n "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ btree_node_type(b), WRITE, &buf); ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ dump_stack(); ++ } ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_keys, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && ++ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) ++ bch2_btree_node_iter_advance(node_iter, b); ++ ++ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); ++ set_btree_node_dirty_acct(c, b); ++ set_btree_node_need_write(b); ++ ++ printbuf_exit(&buf); ++} ++ ++static void ++__bch2_btree_insert_keys_interior(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct keylist *keys) ++{ ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); ++ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ++ ; ++ ++ while (!bch2_keylist_empty(keys)) { ++ bch2_insert_fixup_btree_ptr(as, trans, path, b, ++ &node_iter, bch2_keylist_front(keys)); ++ bch2_keylist_pop_front(keys); ++ } ++} ++ ++/* ++ * Move keys from n1 (original replacement node, now lower node) to n2 (higher ++ * node) ++ */ ++static struct btree *__btree_split_node(struct btree_update *as, ++ struct btree *n1) ++{ ++ struct bkey_format_state s; ++ size_t nr_packed = 0, nr_unpacked = 0; ++ struct btree *n2; ++ struct bset *set1, *set2; ++ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; ++ struct bpos n1_pos; ++ ++ n2 = bch2_btree_node_alloc(as, n1->c.level); ++ ++ n2->data->max_key = n1->data->max_key; ++ n2->data->format = n1->format; ++ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); ++ n2->key.k.p = n1->key.k.p; ++ ++ bch2_btree_update_add_new_node(as, n2); ++ ++ set1 = btree_bset_first(n1); ++ set2 = btree_bset_first(n2); ++ ++ /* ++ * Has to be a linear search because we don't have an auxiliary ++ * search tree yet ++ */ ++ k = set1->start; ++ while (1) { ++ struct bkey_packed *n = bkey_next(k); ++ ++ if (n == vstruct_last(set1)) ++ break; ++ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) ++ break; ++ ++ if (bkey_packed(k)) ++ nr_packed++; ++ else ++ nr_unpacked++; ++ 
++ prev = k; ++ k = n; ++ } ++ ++ BUG_ON(!prev); ++ set2_start = k; ++ set2_end = vstruct_last(set1); ++ ++ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); ++ set_btree_bset_end(n1, n1->set); ++ ++ n1->nr.live_u64s = le16_to_cpu(set1->u64s); ++ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); ++ n1->nr.packed_keys = nr_packed; ++ n1->nr.unpacked_keys = nr_unpacked; ++ ++ n1_pos = bkey_unpack_pos(n1, prev); ++ if (as->c->sb.version < bcachefs_metadata_version_snapshot) ++ n1_pos.snapshot = U32_MAX; ++ ++ btree_set_max(n1, n1_pos); ++ btree_set_min(n2, bpos_successor(n1->key.k.p)); ++ ++ bch2_bkey_format_init(&s); ++ bch2_bkey_format_add_pos(&s, n2->data->min_key); ++ bch2_bkey_format_add_pos(&s, n2->data->max_key); ++ ++ for (k = set2_start; k != set2_end; k = bkey_next(k)) { ++ struct bkey uk = bkey_unpack_key(n1, k); ++ bch2_bkey_format_add_key(&s, &uk); ++ } ++ ++ n2->data->format = bch2_bkey_format_done(&s); ++ btree_node_set_format(n2, n2->data->format); ++ ++ out = set2->start; ++ memset(&n2->nr, 0, sizeof(n2->nr)); ++ ++ for (k = set2_start; k != set2_end; k = bkey_next(k)) { ++ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) ++ ? &n1->format : &bch2_bkey_format_current, k)); ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ btree_keys_account_key_add(&n2->nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ set2->u64s = cpu_to_le16((u64 *) out - set2->_data); ++ set_btree_bset_end(n2, n2->set); ++ ++ BUG_ON(!set1->u64s); ++ BUG_ON(!set2->u64s); ++ ++ btree_node_reset_sib_u64s(n1); ++ btree_node_reset_sib_u64s(n2); ++ ++ bch2_verify_btree_nr_keys(n1); ++ bch2_verify_btree_nr_keys(n2); ++ ++ if (n1->c.level) { ++ btree_node_interior_verify(as->c, n1); ++ btree_node_interior_verify(as->c, n2); ++ } ++ ++ return n2; ++} ++ ++/* ++ * For updates to interior nodes, we've got to do the insert before we split ++ * because the stuff we're inserting has to be inserted atomically. Post split, ++ * the keys might have to go in different nodes and the split would no longer be ++ * atomic. 
++ * ++ * Worse, if the insert is from btree node coalescing, if we do the insert after ++ * we do the split (and pick the pivot) - the pivot we pick might be between ++ * nodes that were coalesced, and thus in the middle of a child node post ++ * coalescing: ++ */ ++static void btree_split_insert_keys(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct keylist *keys) ++{ ++ struct btree_node_iter node_iter; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct bkey_packed *src, *dst, *n; ++ struct bset *i; ++ ++ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); ++ ++ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); ++ ++ /* ++ * We can't tolerate whiteouts here - with whiteouts there can be ++ * duplicate keys, and it would be rather bad if we picked a duplicate ++ * for the pivot: ++ */ ++ i = btree_bset_first(b); ++ src = dst = i->start; ++ while (src != vstruct_last(i)) { ++ n = bkey_next(src); ++ if (!bkey_deleted(src)) { ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ src = n; ++ } ++ ++ /* Also clear out the unwritten whiteouts area: */ ++ b->whiteout_u64s = 0; ++ ++ i->u64s = cpu_to_le16((u64 *) dst - i->_data); ++ set_btree_bset_end(b, b->set); ++ ++ BUG_ON(b->nsets != 1 || ++ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); ++ ++ btree_node_interior_verify(as->c, b); ++} ++ ++static void btree_split(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *parent = btree_node_parent(path, b); ++ struct btree *n1, *n2 = NULL, *n3 = NULL; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n1 = bch2_btree_node_alloc_replacement(as, b); ++ ++ if (keys) ++ btree_split_insert_keys(as, trans, path, n1, keys); ++ ++ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { ++ trace_btree_split(c, b); ++ ++ n2 = __btree_split_node(as, n1); ++ ++ bch2_btree_build_aux_trees(n2); ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n2->c.lock); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_update_add_new_node(as, n1); ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); ++ ++ /* ++ * Note that on recursive parent_keys == keys, so we ++ * can't start adding new keys to parent_keys before emptying it ++ * out (which we did with btree_split_insert_keys() above) ++ */ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ bch2_keylist_add(&as->parent_keys, &n2->key); ++ ++ if (!parent) { ++ /* Depth increases, make a new root */ ++ n3 = __btree_root_alloc(as, b->c.level + 1); ++ ++ n3->sib_u64s[0] = U16_MAX; ++ n3->sib_u64s[1] = U16_MAX; ++ ++ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); ++ ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); ++ } ++ } else { ++ trace_btree_compact(c, b); ++ ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_update_add_new_node(as, n1); ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ ++ if (parent) ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ } ++ ++ /* New nodes all written, now make them visible: */ ++ ++ if (parent) { ++ /* Split a non root node */ ++ bch2_btree_insert_node(as, trans, path, parent, 
&as->parent_keys, flags); ++ } else if (n3) { ++ bch2_btree_set_root(as, trans, path, n3); ++ } else { ++ /* Root filled up but didn't need to be split */ ++ bch2_btree_set_root(as, trans, path, n1); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n1); ++ if (n2) ++ bch2_btree_update_get_open_buckets(as, n2); ++ if (n3) ++ bch2_btree_update_get_open_buckets(as, n3); ++ ++ /* Successful split, update the path to point to the new nodes: */ ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ if (n3) ++ bch2_trans_node_add(trans, n3); ++ if (n2) ++ bch2_trans_node_add(trans, n2); ++ bch2_trans_node_add(trans, n1); ++ ++ /* ++ * The old node must be freed (in memory) _before_ unlocking the new ++ * nodes - else another thread could re-acquire a read lock on the old ++ * node after another thread has locked and updated the new node, thus ++ * seeing stale data: ++ */ ++ bch2_btree_node_free_inmem(trans, b); ++ ++ if (n3) ++ six_unlock_intent(&n3->c.lock); ++ if (n2) ++ six_unlock_intent(&n2->c.lock); ++ six_unlock_intent(&n1->c.lock); ++ ++ bch2_trans_verify_locks(trans); ++ ++ bch2_time_stats_update(&c->times[n2 ++ ? BCH_TIME_btree_node_split ++ : BCH_TIME_btree_node_compact], ++ start_time); ++} ++ ++static void ++bch2_btree_insert_keys_interior(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct keylist *keys) ++{ ++ struct btree_path *linked; ++ ++ __bch2_btree_insert_keys_interior(as, trans, path, b, ++ path->l[b->c.level].iter, keys); ++ ++ btree_update_updated_node(as, b); ++ ++ trans_for_each_path_with_node(trans, b, linked) ++ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); ++ ++ bch2_trans_verify_paths(trans); ++} ++ ++/** ++ * bch_btree_insert_node - insert bkeys into a given btree node ++ * ++ * @iter: btree iterator ++ * @keys: list of keys to insert ++ * @hook: insert callback ++ * @persistent: if not null, @persistent will wait on journal write ++ * ++ * Inserts as many keys as it can into a given btree node, splitting it if full. ++ * If a split occurred, this function will return early. This can only happen ++ * for leaf nodes -- inserts into interior nodes have to be atomic. 
++ */ ++static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ lockdep_assert_held(&c->gc_lock); ++ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ BUG_ON(!b->c.level); ++ BUG_ON(!as || as->b); ++ bch2_verify_keylist_sorted(keys); ++ ++ bch2_btree_node_lock_for_insert(trans, path, b); ++ ++ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { ++ bch2_btree_node_unlock_write(trans, path, b); ++ goto split; ++ } ++ ++ btree_node_interior_verify(c, b); ++ ++ bch2_btree_insert_keys_interior(as, trans, path, b, keys); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++ ++ bch2_btree_node_unlock_write(trans, path, b); ++ ++ btree_node_interior_verify(c, b); ++ return; ++split: ++ btree_split(as, trans, path, b, keys, flags); ++} ++ ++int bch2_btree_split_leaf(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags) ++{ ++ struct btree *b = path_l(path)->b; ++ struct btree_update *as; ++ unsigned l; ++ int ret = 0; ++ ++ as = bch2_btree_update_start(trans, path, path->level, ++ true, flags); ++ if (IS_ERR(as)) ++ return PTR_ERR(as); ++ ++ btree_split(as, trans, path, b, NULL, flags); ++ bch2_btree_update_done(as); ++ ++ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ++ ret = bch2_foreground_maybe_merge(trans, path, l, flags); ++ ++ return ret; ++} ++ ++int __bch2_foreground_maybe_merge(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path *sib_path = NULL; ++ struct btree_update *as; ++ struct bkey_format_state new_s; ++ struct bkey_format new_f; ++ struct bkey_i delete; ++ struct btree *b, *m, *n, *prev, *next, *parent; ++ struct bpos sib_pos; ++ size_t sib_u64s; ++ u64 start_time = local_clock(); ++ int ret = 0; ++ ++ BUG_ON(!path->should_be_locked); ++ BUG_ON(!btree_node_locked(path, level)); ++ ++ b = path->l[level].b; ++ ++ if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || ++ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { ++ b->sib_u64s[sib] = U16_MAX; ++ return 0; ++ } ++ ++ sib_pos = sib == btree_prev_sib ++ ? 
bpos_predecessor(b->data->min_key) ++ : bpos_successor(b->data->max_key); ++ ++ sib_path = bch2_path_get(trans, path->btree_id, sib_pos, ++ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, sib_path, false); ++ if (ret) ++ goto err; ++ ++ sib_path->should_be_locked = true; ++ ++ m = sib_path->l[level].b; ++ ++ if (btree_node_parent(path, b) != ++ btree_node_parent(sib_path, m)) { ++ b->sib_u64s[sib] = U16_MAX; ++ goto out; ++ } ++ ++ if (sib == btree_prev_sib) { ++ prev = m; ++ next = b; ++ } else { ++ prev = b; ++ next = m; ++ } ++ ++ if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ bch2_bpos_to_text(&buf1, prev->data->max_key); ++ bch2_bpos_to_text(&buf2, next->data->min_key); ++ bch_err(c, ++ "btree topology error in btree merge:\n" ++ " prev ends at %s\n" ++ " next starts at %s", ++ buf1.buf, buf2.buf); ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); ++ bch2_topology_error(c); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_bkey_format_init(&new_s); ++ bch2_bkey_format_add_pos(&new_s, prev->data->min_key); ++ __bch2_btree_calc_format(&new_s, prev); ++ __bch2_btree_calc_format(&new_s, next); ++ bch2_bkey_format_add_pos(&new_s, next->data->max_key); ++ new_f = bch2_bkey_format_done(&new_s); ++ ++ sib_u64s = btree_node_u64s_with_format(b, &new_f) + ++ btree_node_u64s_with_format(m, &new_f); ++ ++ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { ++ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ sib_u64s /= 2; ++ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ } ++ ++ sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); ++ b->sib_u64s[sib] = sib_u64s; ++ ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ goto out; ++ ++ parent = btree_node_parent(path, b); ++ as = bch2_btree_update_start(trans, path, level, false, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ flags); ++ ret = PTR_ERR_OR_ZERO(as); ++ if (ret) ++ goto err; ++ ++ trace_btree_merge(c, b); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, ++ max(BTREE_NODE_SEQ(b->data), ++ BTREE_NODE_SEQ(m->data)) + 1); ++ ++ btree_set_min(n, prev->data->min_key); ++ btree_set_max(n, next->data->max_key); ++ ++ bch2_btree_update_add_new_node(as, n); ++ ++ n->data->format = new_f; ++ btree_node_set_format(n, new_f); ++ ++ bch2_btree_sort_into(c, n, prev); ++ bch2_btree_sort_into(c, n, next); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ ++ bkey_init(&delete.k); ++ delete.k.p = prev->key.k.p; ++ bch2_keylist_add(&as->parent_keys, &delete); ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ ++ bch2_trans_verify_paths(trans); ++ ++ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ ++ bch2_trans_verify_paths(trans); ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ six_lock_increment(&m->c.lock, SIX_LOCK_intent); ++ ++ bch2_trans_node_add(trans, n); ++ ++ bch2_trans_verify_paths(trans); ++ ++ bch2_btree_node_free_inmem(trans, b); ++ bch2_btree_node_free_inmem(trans, m); ++ ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); ++out: ++err: ++ 
bch2_path_put(trans, sib_path, true); ++ bch2_trans_verify_locks(trans); ++ return ret; ++} ++ ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ */ ++int bch2_btree_node_rewrite(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *n, *parent; ++ struct btree_update *as; ++ int ret; ++ ++ flags |= BTREE_INSERT_NOFAIL; ++ ++ parent = btree_node_parent(iter->path, b); ++ as = bch2_btree_update_start(trans, iter->path, b->c.level, ++ false, flags); ++ ret = PTR_ERR_OR_ZERO(as); ++ if (ret) ++ goto out; ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ trace_btree_rewrite(c, b); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ ++ if (parent) { ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ bch2_btree_insert_node(as, trans, iter->path, parent, ++ &as->parent_keys, flags); ++ } else { ++ bch2_btree_set_root(as, trans, iter->path, n); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_trans_node_add(trans, n); ++ bch2_btree_node_free_inmem(trans, b); ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++out: ++ bch2_btree_path_downgrade(trans, iter->path); ++ return ret; ++} ++ ++struct async_btree_rewrite { ++ struct bch_fs *c; ++ struct work_struct work; ++ enum btree_id btree_id; ++ unsigned level; ++ struct bpos pos; ++ __le64 seq; ++}; ++ ++static int async_btree_node_rewrite_trans(struct btree_trans *trans, ++ struct async_btree_rewrite *a) ++{ ++ struct btree_iter iter; ++ struct btree *b; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, ++ BTREE_MAX_DEPTH, a->level, 0); ++ b = bch2_btree_iter_peek_node(&iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto out; ++ ++ if (!b || b->data->keys.seq != a->seq) ++ goto out; ++ ++ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); ++out : ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++void async_btree_node_rewrite_work(struct work_struct *work) ++{ ++ struct async_btree_rewrite *a = ++ container_of(work, struct async_btree_rewrite, work); ++ struct bch_fs *c = a->c; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ async_btree_node_rewrite_trans(&trans, a)); ++ percpu_ref_put(&c->writes); ++ kfree(a); ++} ++ ++void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) ++{ ++ struct async_btree_rewrite *a; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return; ++ ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (!a) { ++ percpu_ref_put(&c->writes); ++ return; ++ } ++ ++ a->c = c; ++ a->btree_id = b->c.btree_id; ++ a->level = b->c.level; ++ a->pos = b->key.k.p; ++ a->seq = b->data->keys.seq; ++ ++ INIT_WORK(&a->work, async_btree_node_rewrite_work); ++ queue_work(c->btree_interior_update_worker, &a->work); ++} ++ ++static int __bch2_btree_node_update_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter2 = { NULL }; ++ struct btree *parent; ++ int ret; ++ ++ if (!skip_triggers) { ++ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, ++ bkey_i_to_s_c(&b->key), 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 
1, ++ new_key, 0); ++ if (ret) ++ return ret; ++ } ++ ++ if (new_hash) { ++ bkey_copy(&new_hash->key, new_key); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ } ++ ++ parent = btree_node_parent(iter->path, b); ++ if (parent) { ++ bch2_trans_copy_iter(&iter2, iter); ++ ++ iter2.path = bch2_btree_path_make_mut(trans, iter2.path, ++ iter2.flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ BUG_ON(iter2.path->level != b->c.level); ++ BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); ++ ++ btree_node_unlock(trans, iter2.path, iter2.path->level); ++ path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; ++ iter2.path->level++; ++ btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); ++ ++ bch2_btree_path_check_sort(trans, iter2.path, 0); ++ ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; ++ } else { ++ BUG_ON(btree_node_root(c, b) != b); ++ ++ ret = darray_make_room(&trans->extra_journal_entries, ++ jset_u64s(new_key->k.u64s)); ++ if (ret) ++ return ret; ++ ++ journal_entry_set((void *) &darray_top(trans->extra_journal_entries), ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ new_key, new_key->k.u64s); ++ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); ++ } ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ JOURNAL_WATERMARK_reserved); ++ if (ret) ++ goto err; ++ ++ bch2_btree_node_lock_write(trans, iter->path, b); ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new_key); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, new_key); ++ } ++ ++ bch2_btree_node_unlock_write(trans, iter->path, b); ++out: ++ bch2_trans_iter_exit(trans, &iter2); ++ return ret; ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ mutex_unlock(&c->btree_cache.lock); ++ } ++ goto out; ++} ++ ++int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b, struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *new_hash = NULL; ++ struct btree_path *path = iter->path; ++ struct closure cl; ++ int ret = 0; ++ ++ if (!btree_node_intent_locked(path, b->c.level) && ++ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ ++ closure_init_stack(&cl); ++ ++ /* ++ * check btree_ptr_hash_val() after @b is locked by ++ * btree_iter_traverse(): ++ */ ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ if (ret) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ return ret; ++ } ++ ++ new_hash = bch2_btree_node_mem_alloc(c, false); ++ } ++ ++ path->intent_ref++; ++ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, ++ new_key, skip_triggers); ++ --path->intent_ref; ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&new_hash->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ six_unlock_write(&new_hash->c.lock); ++ 
six_unlock_intent(&new_hash->c.lock); ++ } ++ closure_sync(&cl); ++ bch2_btree_cache_cannibalize_unlock(c); ++ return ret; ++} ++ ++int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, ++ struct btree *b, struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* has node been freed? */ ++ if (iter.path->l[b->c.level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* Init code: */ ++ ++/* ++ * Only for filesystem bringup, when first reading the btree roots or allocating ++ * btree roots when initializing a new filesystem: ++ */ ++void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ++{ ++ BUG_ON(btree_node_root(c, b)); ++ ++ bch2_btree_set_root_inmem(c, b); ++} ++ ++void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c, false); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ set_btree_node_fake(b); ++ set_btree_node_need_rewrite(b); ++ b->c.level = 0; ++ b->c.btree_id = id; ++ ++ bkey_btree_ptr_init(&b->key); ++ b->key.k.p = SPOS_MAX; ++ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ bch2_btree_build_aux_trees(b); ++ ++ b->data->flags = 0; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, SPOS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ btree_node_set_format(b, b->data->format); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, ++ b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_update *as; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each_entry(as, &c->btree_interior_update_list, list) ++ prt_printf(out, "%p m %u w %u r %u j %llu\n", ++ as, ++ as->mode, ++ as->nodes_written, ++ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ as->journal.seq); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static bool bch2_btree_interior_updates_pending(struct bch_fs *c) ++{ ++ bool ret; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ret = !list_empty(&c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return ret; ++} ++ ++bool bch2_btree_interior_updates_flush(struct bch_fs *c) ++{ ++ bool ret = bch2_btree_interior_updates_pending(c); ++ ++ if (ret) ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_pending(c)); ++ return ret; ++} ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) ++{ ++ struct btree_root *r; ++ struct jset_entry *entry; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ vstruct_for_each(jset, entry) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) { ++ r = &c->btree_roots[entry->btree_id]; ++ r->level = entry->level; ++ r->alive = 
true; ++ bkey_copy(&r->key, &entry->start[0]); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++struct jset_entry * ++bch2_btree_roots_to_journal_entries(struct bch_fs *c, ++ struct jset_entry *start, ++ struct jset_entry *end) ++{ ++ struct jset_entry *entry; ++ unsigned long have = 0; ++ unsigned i; ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) ++ __set_bit(entry->btree_id, &have); ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].alive && !test_bit(i, &have)) { ++ journal_entry_set(end, ++ BCH_JSET_ENTRY_btree_root, ++ i, c->btree_roots[i].level, ++ &c->btree_roots[i].key, ++ c->btree_roots[i].key.u64s); ++ end = vstruct_next(end); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ return end; ++} ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *c) ++{ ++ if (c->btree_interior_update_worker) ++ destroy_workqueue(c->btree_interior_update_worker); ++ mempool_exit(&c->btree_interior_update_pool); ++} ++ ++int bch2_fs_btree_interior_update_init(struct bch_fs *c) ++{ ++ mutex_init(&c->btree_reserve_cache_lock); ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); ++ mutex_init(&c->btree_interior_update_lock); ++ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); ++ ++ c->btree_interior_update_worker = ++ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); ++ if (!c->btree_interior_update_worker) ++ return -ENOMEM; ++ ++ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +new file mode 100644 +index 000000000000..adfc6c24a7a4 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.h +@@ -0,0 +1,321 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++ ++#include "btree_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++ ++void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, ++ struct bkey_format *); ++ ++#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) ++ ++#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) ++ ++/* ++ * Tracks an in progress split/rewrite of a btree node and the update to the ++ * parent node: ++ * ++ * When we split/rewrite a node, we do all the updates in memory without ++ * waiting for any writes to complete - we allocate the new node(s) and update ++ * the parent node, possibly recursively up to the root. ++ * ++ * The end result is that we have one or more new nodes being written - ++ * possibly several, if there were multiple splits - and then a write (updating ++ * an interior node) which will make all these new nodes visible. ++ * ++ * Additionally, as we split/rewrite nodes we free the old nodes - but the old ++ * nodes can't be freed (their space on disk can't be reclaimed) until the ++ * update to the interior node that makes the new node visible completes - ++ * until then, the old nodes are still reachable on disk. ++ * ++ */ ++struct btree_update { ++ struct closure cl; ++ struct bch_fs *c; ++ u64 start_time; ++ ++ struct list_head list; ++ struct list_head unwritten_list; ++ ++ /* What kind of update are we doing? 
*/ ++ enum { ++ BTREE_INTERIOR_NO_UPDATE, ++ BTREE_INTERIOR_UPDATING_NODE, ++ BTREE_INTERIOR_UPDATING_ROOT, ++ BTREE_INTERIOR_UPDATING_AS, ++ } mode; ++ ++ unsigned nodes_written:1; ++ unsigned took_gc_lock:1; ++ ++ enum btree_id btree_id; ++ ++ struct disk_reservation disk_res; ++ struct journal_preres journal_preres; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_NODE: ++ * The update that made the new nodes visible was a regular update to an ++ * existing interior node - @b. We can't write out the update to @b ++ * until the new nodes we created are finished writing, so we block @b ++ * from writing by putting this btree_interior update on the ++ * @b->write_blocked list with @write_blocked_list: ++ */ ++ struct btree *b; ++ struct list_head write_blocked_list; ++ ++ /* ++ * We may be freeing nodes that were dirty, and thus had journal entries ++ * pinned: we need to transfer the oldest of those pins to the ++ * btree_update operation, and release it when the new node(s) ++ * are all persistent and reachable: ++ */ ++ struct journal_entry_pin journal; ++ ++ /* Preallocated nodes we reserve when we start the update: */ ++ struct prealloc_nodes { ++ struct btree *b[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr; ++ } prealloc_nodes[2]; ++ ++ /* Nodes being freed: */ ++ struct keylist old_keys; ++ u64 _old_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_U64s_MAX]; ++ ++ /* Nodes being added: */ ++ struct keylist new_keys; ++ u64 _new_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_U64s_MAX]; ++ ++ /* New nodes, that will be made reachable by this update: */ ++ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_new_nodes; ++ ++ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; ++ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_old_nodes; ++ ++ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * ++ BCH_REPLICAS_MAX]; ++ open_bucket_idx_t nr_open_buckets; ++ ++ unsigned journal_u64s; ++ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; ++ ++ /* Only here to reduce stack usage on recursive splits: */ ++ struct keylist parent_keys; ++ /* ++ * Enough room for btree_split's keys without realloc - btree node ++ * pointers never have crc/compression info, so we only need to acount ++ * for the pointers for three keys ++ */ ++ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; ++}; ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, ++ struct btree *, ++ struct bkey_format); ++ ++int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); ++ ++int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, ++ unsigned, unsigned, enum btree_node_sibling); ++ ++static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree *b; ++ ++ EBUG_ON(!btree_node_locked(path, level)); ++ ++ b = path->l[level].b; ++ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) ++ return 0; ++ ++ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); ++} ++ ++static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ unsigned flags) ++{ ++ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, ++ btree_prev_sib) ?: ++ bch2_foreground_maybe_merge_sibling(trans, path, level, flags, ++ btree_next_sib); ++} ++ ++void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); ++void bch2_btree_root_alloc(struct bch_fs *, 
enum btree_id); ++ ++static inline unsigned btree_update_reserve_required(struct bch_fs *c, ++ struct btree *b) ++{ ++ unsigned depth = btree_node_root(c, b)->c.level + 1; ++ ++ /* ++ * Number of nodes we might have to allocate in a worst case btree ++ * split operation - we split all the way up to the root, then allocate ++ * a new root, unless we're already at max depth: ++ */ ++ if (depth < BTREE_MAX_DEPTH) ++ return (depth - b->c.level) * 2 + 1; ++ else ++ return (depth - b->c.level) * 2 - 1; ++} ++ ++static inline void btree_node_reset_sib_u64s(struct btree *b) ++{ ++ b->sib_u64s[0] = b->nr.live_u64s; ++ b->sib_u64s[1] = b->nr.live_u64s; ++} ++ ++static inline void *btree_data_end(struct bch_fs *c, struct btree *b) ++{ ++ return (void *) b->data + btree_bytes(c); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, ++ struct btree *b) ++{ ++ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, ++ struct btree *b) ++{ ++ return btree_data_end(c, b); ++} ++ ++static inline void *write_block(struct btree *b) ++{ ++ return (void *) b->data + (b->written << 9); ++} ++ ++static inline bool __btree_addr_written(struct btree *b, void *p) ++{ ++ return p < write_block(b); ++} ++ ++static inline bool bset_written(struct btree *b, struct bset *i) ++{ ++ return __btree_addr_written(b, i); ++} ++ ++static inline bool bkey_written(struct btree *b, struct bkey_packed *k) ++{ ++ return __btree_addr_written(b, k); ++} ++ ++static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, ++ struct btree *b, ++ void *end) ++{ ++ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + ++ b->whiteout_u64s; ++ ssize_t total = c->opts.btree_node_size >> 3; ++ ++ /* Always leave one extra u64 for bch2_varint_decode: */ ++ used++; ++ ++ return total - used; ++} ++ ++static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, ++ struct btree *b) ++{ ++ ssize_t remaining = __bch_btree_u64s_remaining(c, b, ++ btree_bkey_last(b, bset_tree_last(b))); ++ ++ BUG_ON(remaining < 0); ++ ++ if (bset_written(b, btree_bset_last(b))) ++ return 0; ++ ++ return remaining; ++} ++ ++#define BTREE_WRITE_SET_U64s_BITS 9 ++ ++static inline unsigned btree_write_set_buffer(struct btree *b) ++{ ++ /* ++ * Could buffer up larger amounts of keys for btrees with larger keys, ++ * pending benchmarking: ++ */ ++ return 8 << BTREE_WRITE_SET_U64s_BITS; ++} ++ ++static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, ++ struct btree *b) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ struct btree_node_entry *bne = max(write_block(b), ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ssize_t remaining_space = ++ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ ++ if (unlikely(bset_written(b, bset(b, t)))) { ++ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) ++ return bne; ++ } else { ++ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && ++ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) ++ return bne; ++ } ++ ++ return NULL; ++} ++ ++static inline void push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bpos pos) ++{ ++ struct bkey_packed k; ++ ++ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey *u = (void *) &k; ++ ++ bkey_init(u); ++ u->p = pos; ++ } ++ ++ k.needs_whiteout = true; ++ ++ b->whiteout_u64s += k.u64s; ++ bkey_copy(unwritten_whiteouts_start(c, 
b), &k); ++} ++ ++/* ++ * write lock must be held on @b (else the dirty bset that we were going to ++ * insert into could be written out from under us) ++ */ ++static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, ++ struct btree *b, unsigned u64s) ++{ ++ if (unlikely(btree_node_need_rewrite(b))) ++ return false; ++ ++ return u64s <= bch_btree_keys_u64s_remaining(c, b); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); ++ ++bool bch2_btree_interior_updates_flush(struct bch_fs *); ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); ++struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, ++ struct jset_entry *, struct jset_entry *); ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *); ++int bch2_fs_btree_interior_update_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +new file mode 100644 +index 000000000000..e2ecbd3bca77 +--- /dev/null ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -0,0 +1,1800 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "errcode.h" ++#include "error.h" ++#include "extent_update.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "subvolume.h" ++#include "replicas.h" ++ ++#include ++#include ++#include ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, enum btree_update_flags); ++ ++static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, ++ const struct btree_insert_entry *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->cached, r->cached) ?: ++ -cmp_int(l->level, r->level) ?: ++ bpos_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) ++{ ++ return i->path->l + i->level; ++} ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates && ++ insert_l(&i[0])->b == insert_l(&i[-1])->b; ++} ++ ++static inline bool same_leaf_as_next(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i + 1 < trans->updates + trans->nr_updates && ++ insert_l(&i[0])->b == insert_l(&i[1])->b; ++} ++ ++static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (path->cached) ++ return; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(trans, b); ++} ++ ++void bch2_btree_node_lock_for_insert(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ bch2_btree_node_lock_write(trans, path, b); ++ bch2_btree_node_prep_for_write(trans, path, b); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_trans 
*trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(trans->c, b)); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_deleted(k)); ++ ++ /* Deleting, but not found? nothing to do: */ ++ if (bkey_deleted(&insert->k) && !k) ++ return false; ++ ++ if (bkey_deleted(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ return 0; ++} ++ ++static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? 
btree_node_flush0 ++ : btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++static void btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_insert_entry *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = insert_l(insert)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, ++ &insert_l(insert)->iter, insert->k))) ++ return; ++ ++ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty_acct(c, b); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); ++ BUG_ON(i->cached != i->path->cached); ++ BUG_ON(i->level != i->path->level); ++ BUG_ON(i->btree_id != i->path->btree_id); ++ EBUG_ON(!i->level && ++ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && ++ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && ++ i->k->k.p.snapshot && ++ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, u64s, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->fn, trace_ip); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, ++ trans->journal_u64s, ++ flags| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); ++ ++ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; ++} ++ ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct jset_entry *entry = ++ bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_log, 0, 0, ++ JSET_ENTRY_LOG_U64s); ++ struct jset_entry_log *l = ++ container_of(entry, struct jset_entry_log, entry); ++ ++ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); ++} ++ ++static inline enum btree_insert_ret ++btree_key_can_insert(struct btree_trans *trans, ++ struct btree *b, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ return BTREE_INSERT_OK; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert_cached(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ unsigned old_u64s = ck->u64s, new_u64s; ++ struct bkey_i *new_k; ++ ++ EBUG_ON(path->level); ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bch2_btree_key_cache_must_wait(c) && ++ !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) ++ return BTREE_INSERT_NEED_JOURNAL_RECLAIM; ++ ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at most 7 ++ * bytes (it won't be used): ++ */ ++ u64s += 1; ++ ++ if (u64s <= ck->u64s) ++ return BTREE_INSERT_OK; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); ++ return -ENOMEM; ++ } ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ /* ++ * Keys returned by peek() are no longer valid pointers, so we need a ++ * transaction restart: ++ */ ++ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos, ++ old_u64s, new_u64s); ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); ++} ++ ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ struct bkey_i *new = i->k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(i->btree_id)) ++ return 0; ++ ++ if (bch2_bkey_ops[old.k->type].atomic_trigger == ++ bch2_bkey_ops[i->k->k.type].atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ 
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ bch2_bkey_ops[old.k->type].trans_trigger == ++ bch2_bkey_ops[i->k->k.type].trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; ++ } else if (!overwrite && !i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; ++ } else { ++ return 0; ++ } ++} ++ ++static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 1; overwrite >= 0; --overwrite) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ ++ ret = run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. 
++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++ ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ * XXX: synchronization of interior node updates with gc ++ */ ++ BUG_ON(i->cached || i->level); ++ ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct btree_trans_commit_hook *h; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_trans_restart_fault_inject(trans->fn, trace_ip); ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ h = trans->hooks; ++ while (h) { ++ ret = h->fn(trans, h); ++ if (ret) ++ return ret; ++ h = h->next; ++ } ++ ++ trans_for_each_update(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; ++ ret = !i->cached ++ ? btree_key_can_insert(trans, insert_l(i)->b, u64s) ++ : btree_key_can_insert_cached(trans, i->path, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->bkey_type)) ++ marking = true; ++ ++ /* ++ * Revalidate before calling mem triggers - XXX, ugly: ++ * ++ * - successful btree node splits don't cause transaction ++ * restarts and will have invalidated the pointer to the bkey ++ * value ++ * - btree_node_lock_for_insert() -> btree_node_prep_for_write() ++ * when it has to resort ++ * - btree_key_can_insert_cached() when it has to reallocate ++ * ++ * Ugly because we currently have no way to tell if the ++ * pointer's been invalidated, which means it's debatabale ++ * whether we should be stashing the old key at all. 
++ */ ++ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, ++ i->k->k.p); ++ ++ if (j_k) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ return ret; ++ ++ journal_transaction_name(trans); ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ if (unlikely(trans->extra_journal_entries.nr)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); ++ ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (bch2_journal_seq_verify) ++ trans_for_each_update(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (bch2_inject_invalid_keys) ++ trans_for_each_update(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) ++ return BTREE_INSERT_NEED_MARK_REPLICAS; ++ ++ trans_for_each_update(trans, i) ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = run_one_mem_trigger(trans, i, i->flags); ++ if (ret) ++ return ret; ++ } ++ ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_commit_run_gc_triggers(trans); ++ if (ret) ++ return ret; ++ } ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ trans_for_each_update(trans, i) { ++ struct journal *j = &c->journal; ++ struct jset_entry *entry; ++ ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_overwrite, ++ i->btree_id, i->level, ++ i->old_k.u64s); ++ bkey_reassemble(&entry->start[0], ++ (struct bkey_s_c) { &i->old_k, i->old_v }); ++ ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_btree_keys, ++ i->btree_id, i->level, ++ i->k->k.u64s); ++ bkey_copy(&entry->start[0], i->k); ++ } ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++ ++ trans_for_each_update(trans, i) { ++ i->k->k.needs_whiteout = false; ++ ++ if (!i->cached) ++ btree_insert_key_leaf(trans, i); ++ else if (!i->key_cache_already_flushed) ++ bch2_btree_insert_key_cached(trans, i->path, i->k); ++ else ++ bch2_btree_key_cache_drop(trans, i->path); ++ } ++ ++ return ret; ++} ++ ++static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) ++{ ++ unsigned l; ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) ++ if (btree_node_read_locked(path, l)) ++ BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); ++} ++ ++static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree *b = path_l(path)->b; ++ ++ do { ++ if (path->nodes_locked && ++ path->nodes_locked != path->nodes_intent_locked) ++ path_upgrade_readers(trans, path); ++ } while ((path = prev_btree_path(trans, path)) && ++ path_l(path)->b == b); ++} ++ ++/* ++ * Check for nodes that we have both read and intent 
locks on, and upgrade the ++ * readers to intent: ++ */ ++static inline void normalize_read_intent_locks(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ unsigned i, nr_read = 0, nr_intent = 0; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ struct btree_path *next = i + 1 < trans->nr_sorted ++ ? trans->paths + trans->sorted[i + 1] ++ : NULL; ++ ++ if (path->nodes_locked) { ++ if (path->nodes_intent_locked) ++ nr_intent++; ++ else ++ nr_read++; ++ } ++ ++ if (!next || path_l(path)->b != path_l(next)->b) { ++ if (nr_read && nr_intent) ++ upgrade_readers(trans, path); ++ ++ nr_read = nr_intent = 0; ++ } ++ } ++ ++ bch2_trans_verify_locks(trans); ++} ++ ++static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) ++{ ++ struct btree_path *path; ++ unsigned i; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ //if (path == pos) ++ // break; ++ ++ if (path->nodes_locked != path->nodes_intent_locked && ++ !bch2_btree_path_upgrade(trans, path, path->level + 1)) ++ return true; ++ } ++ ++ return false; ++} ++ ++static inline int trans_lock_write(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ int ret; ++ ++ trans_for_each_update(trans, i) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ if (!six_trylock_write(&insert_l(i)->b->c.lock)) { ++ if (have_conflicting_read_lock(trans, i->path)) ++ goto fail; ++ ++ ret = btree_node_lock_type(trans, i->path, ++ insert_l(i)->b, ++ i->path->pos, i->level, ++ SIX_LOCK_write, NULL, NULL); ++ BUG_ON(ret); ++ } ++ ++ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ return 0; ++fail: ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_trans_restart_would_deadlock_write(trans->fn); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); ++} ++ ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++} ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; ++ int ret, u64s_delta = 0; ++ int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; ++ ++ trans_for_each_update(trans, i) { ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ prt_newline(&buf); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, &buf); ++ ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ return -EINVAL; ++ } ++ btree_insert_entry_checks(trans, i); ++ } ++ ++ printbuf_exit(&buf); ++ ++ trans_for_each_update(trans, i) { ++ if (i->cached) ++ continue; ++ ++ u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; ++ u64s_delta -= i->old_btree_u64s; ++ ++ if (!same_leaf_as_next(trans, i)) { ++ if (u64s_delta <= 0) { ++ ret = bch2_foreground_maybe_merge(trans, i->path, ++ i->level, trans->flags); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ u64s_delta = 0; ++ } ++ } ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ JOURNAL_RES_GET_NONBLOCK| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); ++ if (unlikely(ret == -EAGAIN)) ++ ret = bch2_trans_journal_preres_get_cold(trans, ++ trans->journal_preres_u64s, trace_ip); ++ if (unlikely(ret)) ++ return ret; ++ ++ normalize_read_intent_locks(trans); ++ ++ ret = trans_lock_write(trans); ++ if (unlikely(ret)) ++ return ret; ++ ++ ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); ++ ++ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_drop_overwrites_from_journal(trans); ++ ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(trans, i->path, ++ insert_l(i)->b); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static int journal_reclaim_wait_done(struct bch_fs *c) ++{ ++ int ret = bch2_journal_error(&c->journal) ?: ++ !bch2_btree_key_cache_must_wait(c); ++ ++ if (!ret) ++ journal_reclaim_kick(&c->journal); ++ return ret; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ int ret, unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ ++ switch (ret) { ++ case BTREE_INSERT_BTREE_NODE_FULL: ++ ret = bch2_btree_split_leaf(trans, i->path, trans->flags); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ trace_trans_restart_btree_node_split(trans->fn, trace_ip, ++ i->btree_id, &i->path->pos); ++ break; ++ case BTREE_INSERT_NEED_MARK_REPLICAS: ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); ++ if (ret) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RES: ++ bch2_trans_unlock(trans); ++ ++ if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && ++ !(trans->flags & JOURNAL_WATERMARK_reserved)) { ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ break; ++ } ++ ++ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); ++ if (ret) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RECLAIM: ++ bch2_trans_unlock(trans); ++ ++ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); ++ ++ wait_event_freezable(c->journal.reclaim_wait, ++ (ret = journal_reclaim_wait_done(c))); ++ if (ret < 0) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); ++ break; ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); ++ BUG_ON(ret == -ENOSPC && ++ !(trans->flags & BTREE_INSERT_NOWAIT) && ++ (trans->flags & 
BTREE_INSERT_NOFAIL)); ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EROFS; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_fs_read_write_early(c) ?: ++ bch2_trans_relock(trans); ++ if (ret) ++ return ret; ++ ++ percpu_ref_get(&c->writes); ++ return 0; ++} ++ ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. ++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i = NULL; ++ unsigned u64s; ++ int ret = 0; ++ ++ if (!trans->nr_updates && ++ !trans->extra_journal_entries.nr) ++ goto out_reset; ++ ++ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; ++ ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget_live(&c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ goto out_reset; ++ } ++ ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entries.nr; ++ trans->journal_preres_u64s = 0; ++ ++ /* For journalling transaction name: */ ++ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); ++ ++ trans_for_each_update(trans, i) { ++ BUG_ON(!i->path->should_be_locked); ++ ++ if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { ++ trace_trans_restart_upgrade(trans->fn, _RET_IP_, ++ i->btree_id, &i->path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_intent_locked(i->path, i->level)); ++ ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ /* we're going to journal the key being updated: */ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (i->cached && ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ ++ /* and we're also going to log the overwrite: */ ++ trans->journal_u64s += jset_u64s(i->old_k.u64s); ++ } ++ ++ if (trans->extra_journal_res) { ++ ret = bch2_disk_reservation_add(c, trans->disk_res, ++ trans->extra_journal_res, ++ (trans->flags & BTREE_INSERT_NOFAIL) ++ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto err; ++ } ++retry: ++ BUG_ON(trans->restarted); ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, &i, _RET_IP_); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trace_transaction_commit(trans->fn, _RET_IP_); ++out: ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) ++ percpu_ref_put(&c->writes); ++out_reset: ++ bch2_trans_reset_updates(trans); ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset((void *) trans->fs_usage_deltas + ++ offsetof(struct replicas_delta_list, memset_start), 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} ++ ++static int check_pos_snapshot_overwritten(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ ++ if (!snapshot_t(c, pos.snapshot)->children[0]) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (!k.k) ++ break; ++ ++ if (bkey_cmp(pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, update_iter; ++ struct bpos start = bkey_start_pos(&insert->k); ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ enum btree_id btree_id = orig_iter->btree_id; ++ int ret = 0, compressed_sectors; ++ ++ bch2_trans_iter_init(trans, &iter, btree_id, start, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ ++ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ++ /* ++ * We can't merge extents if they belong to interior snapshot ++ * tree nodes, and there's a snapshot in which one extent is ++ * visible and the other is not - i.e. if visibility is ++ * different. 
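++	 * (An "interior snapshot tree node" here is a snapshot ID that has
++	 * child snapshots - the same condition check_pos_snapshot_overwritten()
++	 * tests with the snapshot_t()->children[0] check above.)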
++ * ++ * Instead of checking if visibilitiy of the two extents is ++ * different, for now we just check if either has been ++ * overwritten: ++ */ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge1; ++ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge1; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { ++ ret = bch2_btree_delete_at(trans, &iter, flags); ++ if (ret) ++ goto err; ++ ++ insert = update; ++ goto next; ++ } ++ } ++nomerge1: ++ ret = 0; ++ if (!bkey_cmp(k.k->p, start)) ++ goto next; ++ ++ while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { ++ bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; ++ bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; ++ ++ /* ++ * If we're going to be splitting a compressed extent, note it ++ * so that __bch2_trans_commit() can increase our disk ++ * reservation: ++ */ ++ if (((front_split && back_split) || ++ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && ++ (compressed_sectors = bch2_bkey_sectors_compressed(k))) ++ trans->extra_journal_res += compressed_sectors; ++ ++ if (front_split) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ bch2_cut_back(start, update); ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ if (k.k->p.snapshot != insert->k.p.snapshot && ++ (front_split || back_split)) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ bch2_cut_front(start, update); ++ bch2_cut_back(insert->k.p, update); ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (bkey_cmp(k.k->p, insert->k.p) <= 0) { ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = k.k->p; ++ ++ if (insert->k.p.snapshot != k.k->p.snapshot) { ++ update->k.p.snapshot = insert->k.p.snapshot; ++ update->k.type = KEY_TYPE_whiteout; ++ } ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ if (back_split) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ 
bch2_cut_front(insert->k.p, update); ++ ++ ret = bch2_trans_update_by_path(trans, iter.path, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ if (ret) ++ goto err; ++ goto out; ++ } ++next: ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ } ++ ++ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { ++ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge2; ++ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge2; ++ ++ bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++ } ++nomerge2: ++ ret = 0; ++out: ++ if (!bkey_deleted(&insert->k)) { ++ /* ++ * Rewinding iterators is expensive: get a new one and the one ++ * that points to the start of insert will be cloned from: ++ */ ++ bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, insert, flags); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++/* ++ * When deleting, check if we need to emit a whiteout (because we're overwriting ++ * something in an ancestor snapshot) ++ */ ++static int need_whiteout_for_snapshot(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot = pos.snapshot; ++ int ret; ++ ++ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) ++ return 0; ++ ++ pos.snapshot++; ++ ++ for_each_btree_key_norestart(trans, iter, btree_id, pos, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { ++ if (bkey_cmp(k.k->p, pos)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(trans->c, snapshot, ++ k.k->p.snapshot)) { ++ ret = !bkey_whiteout(k.k); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i, n; ++ int ret = 0; ++ ++ BUG_ON(!path->should_be_locked); ++ ++ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ BUG_ON(bpos_cmp(k->k.p, path->pos)); ++ ++ n = (struct btree_insert_entry) { ++ .flags = flags, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, ++ .k = k, ++ .ip_allocated = ip, ++ }; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ BUG_ON(i != trans->updates && ++ btree_insert_entry_cmp(i - 1, i) >= 0); ++#endif ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update, ++ * then delete/trim any updates the new update overwrites: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_insert_entry_cmp(&n, i) <= 0) ++ break; ++ ++ if (i < trans->updates + trans->nr_updates && ++ !btree_insert_entry_cmp(&n, i)) { ++ BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); ++ ++ bch2_path_put(trans, i->path, true); ++ i->flags = n.flags; ++ i->cached = n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->ip_allocated = n.ip_allocated; ++ } else { ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ ++ 
i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ __btree_path_get(i->path, true); ++ ++ /* ++ * If a key is present in the key cache, it must also exist in the ++ * btree - this is necessary for cache coherency. When iterating over ++ * a btree that's cached in the key cache, the btree iter code checks ++ * the key cache - but the key has to exist in the btree for that to ++ * work: ++ */ ++ if (path->cached && ++ bkey_deleted(&i->old_k)) { ++ struct btree_path *btree_path; ++ ++ i->key_cache_already_flushed = true; ++ i->flags |= BTREE_TRIGGER_NORUN; ++ ++ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT, _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, btree_path, 0); ++ if (ret) ++ goto err; ++ ++ btree_path->should_be_locked = true; ++ ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); ++err: ++ bch2_path_put(trans, btree_path, true); ++ } ++ ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); ++} ++ ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ /* ++ * Ensure that updates to cached btrees go to the key cache: ++ */ ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ bpos_cmp(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = (void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); ++ } ++ ++ iter->key_cache_path->should_be_locked = true; ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags); ++} ++ ++void bch2_trans_commit_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ h->next = trans->hooks; ++ trans->hooks = h; ++} ++ ++int __bch2_btree_insert(struct 
btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, 0); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * @insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k)); ++} ++ ++int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, ++ unsigned len, unsigned update_flags) ++{ ++ struct bkey_i *k; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.p = iter->pos; ++ bch2_key_resize(&k->k, len); ++ return bch2_trans_update(trans, iter, k, update_flags); ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned update_flags) ++{ ++ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); ++} ++ ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); ++retry: ++ while ((bch2_trans_begin(trans), ++ (k = bch2_btree_iter_peek(&iter)).k) && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter.pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); ++ struct bkey_i delete; ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * This could probably be more efficient for extents: ++ */ ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
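++		 * For example, if k spans sectors [0, 16) but the caller asked
++		 * to delete starting at sector 8, iter.pos will be 8: the
++		 * delete key must start there, or we would also wipe out
++		 * [0, 8), which is outside the requested range.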
++ */ ++ delete.k.p = iter.pos; ++ ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) { ++ unsigned max_sectors = ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_trim_atomic(trans, &iter, &delete); ++ if (ret) ++ break; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: ++ bch2_trans_commit(trans, &disk_res, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(trans->c, &disk_res); ++ if (ret) ++ break; ++ } ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ ret = 0; ++ goto retry; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ return bch2_trans_do(c, NULL, journal_seq, 0, ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ update_flags, journal_seq)); ++} ++ ++int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) ++{ ++ unsigned len = strlen(msg); ++ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); ++ struct jset_entry_log *l; ++ int ret; ++ ++ ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); ++ if (ret) ++ return ret; ++ ++ l = (void *) &darray_top(trans->extra_journal_entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, msg, len); ++ while (len & 7) ++ l->d[len++] = '\0'; ++ ++ trans->extra_journal_entries.nr += jset_u64s(u64s); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +new file mode 100644 +index 000000000000..b4be2122c2d5 +--- /dev/null ++++ b/fs/bcachefs/buckets.c +@@ -0,0 +1,2113 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "backpointers.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "buckets_waiting_for_journal.h" ++#include "ec.h" ++#include "error.h" ++#include "inode.h" ++#include "movinggc.h" ++#include "recovery.h" ++#include "reflink.h" ++#include "replicas.h" ++#include "subvolume.h" ++ ++#include ++#include ++ ++static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, ++ enum bch_data_type data_type, ++ s64 sectors) ++{ ++ switch (data_type) { ++ case BCH_DATA_btree: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_cached: ++ fs_usage->cached += sectors; ++ break; ++ default: ++ break; ++ } ++} ++ ++void bch2_fs_usage_initialize(struct bch_fs *c) ++{ ++ struct bch_fs_usage *usage; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ usage = c->usage_base; ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ usage->reserved += usage->persistent_reserved[i]; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage dev = bch2_dev_usage_read(ca); ++ ++ usage->hidden += (dev.d[BCH_DATA_sb].buckets + ++ dev.d[BCH_DATA_journal].buckets) * ++ ca->mi.bucket_size; ++ } ++ ++ percpu_up_write(&c->mark_lock); ++} ++ ++static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, ++ unsigned journal_seq, ++ bool gc) ++{ ++ BUG_ON(!gc && !journal_seq); ++ ++ return this_cpu_ptr(gc ++ ? ca->usage_gc ++ : ca->usage[journal_seq & JOURNAL_BUF_MASK]); ++} ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage ret; ++ unsigned seq, i, u64s = dev_usage_u64s(); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) ++ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, ++ unsigned journal_seq, ++ bool gc) ++{ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ BUG_ON(!gc && !journal_seq); ++ ++ return this_cpu_ptr(gc ++ ? 
c->usage_gc ++ : c->usage[journal_seq & JOURNAL_BUF_MASK]); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) ++{ ++ ssize_t offset = v - (u64 *) c->usage_base; ++ unsigned i, seq; ++ u64 ret; ++ ++ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ ret = *v; ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) ++{ ++ struct bch_fs_usage_online *ret; ++ unsigned seq, i, u64s; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ ret = kmalloc(sizeof(struct bch_fs_usage_online) + ++ sizeof(u64) * c->replicas.nr, GFP_NOFS); ++ if (unlikely(!ret)) { ++ percpu_up_read(&c->mark_lock); ++ return NULL; ++ } ++ ++ ret->online_reserved = percpu_u64_get(c->online_reserved); ++ ++ u64s = fs_usage_u64s(c); ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) ++{ ++ struct bch_dev *ca; ++ unsigned i, u64s = fs_usage_u64s(c); ++ ++ BUG_ON(idx >= ARRAY_SIZE(c->usage)); ++ ++ preempt_disable(); ++ write_seqcount_begin(&c->usage_lock); ++ ++ acc_u64s_percpu((u64 *) c->usage_base, ++ (u64 __percpu *) c->usage[idx], u64s); ++ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) { ++ u64s = dev_usage_u64s(); ++ ++ acc_u64s_percpu((u64 *) ca->usage_base, ++ (u64 __percpu *) ca->usage[idx], u64s); ++ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); ++ } ++ rcu_read_unlock(); ++ ++ write_seqcount_end(&c->usage_lock); ++ preempt_enable(); ++} ++ ++void bch2_fs_usage_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_fs_usage_online *fs_usage) ++{ ++ unsigned i; ++ ++ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); ++ ++ prt_printf(out, "hidden:\t\t\t\t%llu\n", ++ fs_usage->u.hidden); ++ prt_printf(out, "data:\t\t\t\t%llu\n", ++ fs_usage->u.data); ++ prt_printf(out, "cached:\t\t\t\t%llu\n", ++ fs_usage->u.cached); ++ prt_printf(out, "reserved:\t\t\t%llu\n", ++ fs_usage->u.reserved); ++ prt_printf(out, "nr_inodes:\t\t\t%llu\n", ++ fs_usage->u.nr_inodes); ++ prt_printf(out, "online reserved:\t\t%llu\n", ++ fs_usage->online_reserved); ++ ++ for (i = 0; ++ i < ARRAY_SIZE(fs_usage->u.persistent_reserved); ++ i++) { ++ prt_printf(out, "%u replicas:\n", i + 1); ++ prt_printf(out, "\treserved:\t\t%llu\n", ++ fs_usage->u.persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ prt_printf(out, "\t"); ++ bch2_replicas_entry_to_text(out, e); ++ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); ++ } ++} ++ ++static u64 reserve_factor(u64 r) ++{ ++ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); ++} ++ ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) ++{ ++ return min(fs_usage->u.hidden + ++ fs_usage->u.btree + ++ fs_usage->u.data + ++ reserve_factor(fs_usage->u.reserved + ++ fs_usage->online_reserved), ++ c->capacity); ++} ++ ++static struct 
bch_fs_usage_short ++__bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ u64 data, reserved; ++ ++ ret.capacity = c->capacity - ++ bch2_fs_usage_read_one(c, &c->usage_base->hidden); ++ ++ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + ++ bch2_fs_usage_read_one(c, &c->usage_base->btree); ++ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + ++ percpu_u64_get(c->online_reserved); ++ ++ ret.used = min(ret.capacity, data + reserve_factor(reserved)); ++ ret.free = ret.capacity - ret.used; ++ ++ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = __bch2_fs_usage_read_short(c); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++void bch2_dev_usage_init(struct bch_dev *ca) ++{ ++ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++} ++ ++static inline int bucket_sectors_fragmented(struct bch_dev *ca, ++ struct bch_alloc_v4 a) ++{ ++ return a.dirty_sectors ++ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) ++ : 0; ++} ++ ++static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_alloc_v4 old, ++ struct bch_alloc_v4 new, ++ u64 journal_seq, bool gc) ++{ ++ struct bch_fs_usage *fs_usage; ++ struct bch_dev_usage *u; ++ ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); ++ ++ if (data_type_is_hidden(old.data_type)) ++ fs_usage->hidden -= ca->mi.bucket_size; ++ if (data_type_is_hidden(new.data_type)) ++ fs_usage->hidden += ca->mi.bucket_size; ++ ++ u = dev_usage_ptr(ca, journal_seq, gc); ++ ++ u->d[old.data_type].buckets--; ++ u->d[new.data_type].buckets++; ++ ++ u->buckets_ec -= (int) !!old.stripe; ++ u->buckets_ec += (int) !!new.stripe; ++ ++ u->d[old.data_type].sectors -= old.dirty_sectors; ++ u->d[new.data_type].sectors += new.dirty_sectors; ++ ++ u->d[BCH_DATA_cached].sectors += new.cached_sectors; ++ u->d[BCH_DATA_cached].sectors -= old.cached_sectors; ++ ++ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); ++ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); ++ ++ preempt_enable(); ++} ++ ++static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, ++ struct bucket old, struct bucket new, ++ u64 journal_seq, bool gc) ++{ ++ struct bch_alloc_v4 old_a = { ++ .gen = old.gen, ++ .data_type = old.data_type, ++ .dirty_sectors = old.dirty_sectors, ++ .cached_sectors = old.cached_sectors, ++ .stripe = old.stripe, ++ }; ++ struct bch_alloc_v4 new_a = { ++ .gen = new.gen, ++ .data_type = new.data_type, ++ .dirty_sectors = new.dirty_sectors, ++ .cached_sectors = new.cached_sectors, ++ .stripe = new.stripe, ++ }; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++} ++ ++static inline int __update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ ++static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_replicas_entry *r, s64 sectors, ++ unsigned journal_seq, bool gc) ++{ ++ struct bch_fs_usage __percpu *fs_usage; ++ int idx, ret = 0; ++ struct printbuf buf = PRINTBUF; ++ ++ 
percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ if (idx < 0 && ++ fsck_err(c, "no replicas entry\n" ++ " while marking %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ percpu_up_read(&c->mark_lock); ++ ret = bch2_mark_replicas(c, r); ++ percpu_down_read(&c->mark_lock); ++ ++ if (ret) ++ goto err; ++ idx = bch2_replicas_entry_idx(c, r); ++ } ++ if (idx < 0) { ++ ret = -1; ++ goto err; ++ } ++ ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); ++ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); ++ fs_usage->replicas[idx] += sectors; ++ preempt_enable(); ++err: ++fsck_err: ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline int update_cached_sectors(struct bch_fs *c, ++ struct bkey_s_c k, ++ unsigned dev, s64 sectors, ++ unsigned journal_seq, bool gc) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ return update_replicas(c, k, &r.e, sectors, journal_seq, gc); ++} ++ ++static struct replicas_delta_list * ++replicas_deltas_realloc(struct btree_trans *trans, unsigned more) ++{ ++ struct replicas_delta_list *d = trans->fs_usage_deltas; ++ unsigned new_size = d ? (d->size + more) * 2 : 128; ++ unsigned alloc_size = sizeof(*d) + new_size; ++ ++ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); ++ ++ if (!d || d->used + more > d->size) { ++ d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); ++ ++ BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); ++ ++ if (!d) { ++ d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); ++ memset(d, 0, REPLICAS_DELTA_LIST_MAX); ++ ++ if (trans->fs_usage_deltas) ++ memcpy(d, trans->fs_usage_deltas, ++ trans->fs_usage_deltas->size + sizeof(*d)); ++ ++ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); ++ kfree(trans->fs_usage_deltas); ++ } ++ ++ d->size = new_size; ++ trans->fs_usage_deltas = d; ++ } ++ return d; ++} ++ ++static inline void update_replicas_list(struct btree_trans *trans, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ struct replicas_delta_list *d; ++ struct replicas_delta *n; ++ unsigned b; ++ ++ if (!sectors) ++ return; ++ ++ b = replicas_entry_bytes(r) + 8; ++ d = replicas_deltas_realloc(trans, b); ++ ++ n = (void *) d->d + d->used; ++ n->delta = sectors; ++ memcpy((void *) n + offsetof(struct replicas_delta, r), ++ r, replicas_entry_bytes(r)); ++ bch2_replicas_entry_sort(&n->r); ++ d->used += b; ++} ++ ++static inline void update_cached_sectors_list(struct btree_trans *trans, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas_list(trans, &r.e, sectors); ++} ++ ++int bch2_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 old_a, new_a; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ /* ++ * alloc btree is read in by bch2_alloc_read, not gc: ++ */ ++ if ((flags & BTREE_TRIGGER_GC) && ++ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) ++ return 0; ++ ++ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, ++ "alloc key for invalid device or bucket")) ++ return -EIO; ++ ++ ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ bch2_alloc_to_v4(new, &new_a); ++ ++ if ((flags & BTREE_TRIGGER_INSERT) && ++ data_type_is_empty(old_a.data_type) != ++ 
data_type_is_empty(new_a.data_type) && ++ new.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; ++ ++ BUG_ON(!journal_seq); ++ ++ /* ++ * If the btree updates referring to a bucket weren't flushed ++ * before the bucket became empty again, then the we don't have ++ * to wait on a journal flush before we can reuse the bucket: ++ */ ++ new_a.journal_seq = data_type_is_empty(new_a.data_type) && ++ (journal_seq == v->journal_seq || ++ bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ++ ? 0 : journal_seq; ++ v->journal_seq = new_a.journal_seq; ++ } ++ ++ if (!data_type_is_empty(old_a.data_type) && ++ data_type_is_empty(new_a.data_type) && ++ new_a.journal_seq) { ++ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ new.k->p.inode, new.k->p.offset, ++ new_a.journal_seq); ++ if (ret) { ++ bch2_fs_fatal_error(c, ++ "error setting bucket_needs_journal_commit: %i", ret); ++ return ret; ++ } ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ if (!gc && new_a.gen != old_a.gen) ++ *bucket_gen(ca, new.k->p.offset) = new_a.gen; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++ ++ if (gc) { ++ struct bucket *g = gc_bucket(ca, new.k->p.offset); ++ ++ bucket_lock(g); ++ ++ g->gen_valid = 1; ++ g->gen = new_a.gen; ++ g->data_type = new_a.data_type; ++ g->stripe = new_a.stripe; ++ g->stripe_redundancy = new_a.stripe_redundancy; ++ g->dirty_sectors = new_a.dirty_sectors; ++ g->cached_sectors = new_a.cached_sectors; ++ ++ bucket_unlock(g); ++ } ++ percpu_up_read(&c->mark_lock); ++ ++ /* ++ * need to know if we're getting called from the invalidate path or ++ * not: ++ */ ++ ++ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && ++ old_a.cached_sectors) { ++ ret = update_cached_sectors(c, new, ca->dev_idx, ++ -old_a.cached_sectors, ++ journal_seq, gc); ++ if (ret) { ++ bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); ++ return ret; ++ } ++ } ++ ++ if (new_a.data_type == BCH_DATA_free && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); ++ ++ if (new_a.data_type == BCH_DATA_need_discard && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ bch2_do_discards(c); ++ ++ if (old_a.data_type != BCH_DATA_cached && ++ new_a.data_type == BCH_DATA_cached && ++ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) ++ bch2_do_invalidates(c); ++ ++ if (new_a.data_type == BCH_DATA_need_gc_gens) ++ bch2_do_gc_gens(c); ++ ++ return 0; ++} ++ ++int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) ++{ ++ struct bucket old, new, *g; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ BUG_ON(data_type != BCH_DATA_sb && ++ data_type != BCH_DATA_journal); ++ ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return 0; ++ ++ percpu_down_read(&c->mark_lock); ++ g = gc_bucket(ca, b); ++ ++ bucket_lock(g); ++ old = *g; ++ ++ if (bch2_fs_inconsistent_on(g->data_type && ++ g->data_type != data_type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[g->data_type], ++ bch2_data_types[data_type])) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, ++ "bucket %u:%zu gen %u data type %s sector 
count overflow: %u + %u > bucket size", ++ ca->dev_idx, b, g->gen, ++ bch2_data_types[g->data_type ?: data_type], ++ g->dirty_sectors, sectors)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ new = *g; ++err: ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, 0, true); ++ percpu_up_read(&c->mark_lock); ++ return ret; ++} ++ ++static int check_bucket_ref(struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 b_gen, u8 bucket_data_type, ++ u32 dirty_sectors, u32 cached_sectors) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); ++ u16 bucket_sectors = !ptr->cached ++ ? dirty_sectors ++ : cached_sectors; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (bucket_data_type == BCH_DATA_cached) ++ bucket_data_type = BCH_DATA_user; ++ ++ if (gen_after(ptr->gen, b_gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (b_gen != ptr->gen && !ptr->cached) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ *bucket_gen(ca, bucket_nr), ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (b_gen != ptr->gen) { ++ ret = 1; ++ goto err; ++ } ++ ++ if (!data_type_is_empty(bucket_data_type) && ++ ptr_data_type && ++ bucket_data_type != ptr_data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type], ++ bch2_data_types[ptr_data_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ bucket_sectors, sectors, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned ptr_idx, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ u64 journal_seq = trans->journal_res.seq; ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ bool parity = ptr_idx >= 
nr_data; ++ enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; ++ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; ++ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket old, new, *g; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ /* * XXX doesn't handle deletion */ ++ ++ percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ g = PTR_GC_BUCKET(ca, ptr); ++ ++ if (g->dirty_sectors || ++ (g->stripe && g->stripe != k.k->p.offset)) { ++ bch2_fs_inconsistent(c, ++ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ bucket_lock(g); ++ old = *g; ++ ++ ret = check_bucket_ref(c, k, ptr, sectors, data_type, ++ g->gen, g->data_type, ++ g->dirty_sectors, g->cached_sectors); ++ if (ret) ++ goto err; ++ ++ if (data_type) ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ ++ g->stripe = k.k->p.offset; ++ g->stripe_redundancy = s->nr_redundant; ++ new = *g; ++err: ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int __mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ u32 *dirty_sectors, u32 *cached_sectors) ++{ ++ u32 *dst_sectors = !ptr->cached ++ ? dirty_sectors ++ : cached_sectors; ++ int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, ++ bucket_gen, *bucket_data_type, ++ *dirty_sectors, *cached_sectors); ++ ++ if (ret) ++ return ret; ++ ++ *dst_sectors += sectors; ++ *bucket_data_type = *dirty_sectors || *cached_sectors ++ ? 
ptr_data_type : 0; ++ return 0; ++} ++ ++static int bch2_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ unsigned flags) ++{ ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket old, new, *g; ++ u8 bucket_data_type; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ percpu_down_read(&c->mark_lock); ++ g = PTR_GC_BUCKET(ca, &p.ptr); ++ bucket_lock(g); ++ old = *g; ++ ++ bucket_data_type = g->data_type; ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, ++ data_type, g->gen, ++ &bucket_data_type, ++ &g->dirty_sectors, ++ &g->cached_sectors); ++ if (!ret) ++ g->data_type = bucket_data_type; ++ ++ new = *g; ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++static int bch2_mark_stripe_ptr(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_stripe_ptr p, ++ enum bch_data_type data_type, ++ s64 sectors, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_replicas_padded r; ++ struct gc_stripe *m; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ (u64) p.idx); ++ return -ENOMEM; ++ } ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ bch2_inconsistent_error(c); ++ return -EIO; ++ } ++ ++ m->block_sectors[p.block] += sectors; ++ ++ r = m->r; ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ r.e.data_type = data_type; ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); ++ ++ return 0; ++} ++ ++int bch2_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ++ ? BCH_DATA_btree ++ : BCH_DATA_user; ++ s64 sectors = bkey_is_btree_ptr(k.k) ++ ? 
btree_sectors(c) ++ : k.k->size; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = ptr_disk_sectors(sectors, p); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ disk_sectors = -disk_sectors; ++ ++ ret = bch2_mark_pointer(trans, k, p, disk_sectors, ++ data_type, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) { ++ ret = update_cached_sectors(c, k, p.ptr.dev, ++ disk_sectors, journal_seq, true); ++ if (ret) { ++ bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); ++ return ret; ++ } ++ } ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, ++ disk_sectors, flags); ++ if (ret) ++ return ret; ++ ++ /* ++ * There may be other dirty pointers in this extent, but ++ * if so they're not required for mounting if we have an ++ * erasure coded pointer in this extent: ++ */ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) { ++ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ u64 idx = new.k->p.offset; ++ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(old).v : NULL; ++ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ++ ? 
bkey_s_c_to_stripe(new).v : NULL; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(gc && old_s); ++ ++ if (!gc) { ++ struct stripe *m = genradix_ptr(&c->stripes, idx); ++ ++ if (!m || (old_s && !m->alive)) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf1, c, old); ++ bch2_bkey_val_to_text(&buf2, c, new); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" ++ "old %s\n" ++ "new %s", idx, buf1.buf, buf2.buf); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ bch2_inconsistent_error(c); ++ return -1; ++ } ++ ++ if (!new_s) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ memset(m, 0, sizeof(*m)); ++ } else { ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ } else { ++ struct gc_stripe *m = ++ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); ++ ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ idx); ++ return -ENOMEM; ++ } ++ /* ++ * This will be wrong when we bring back runtime gc: we should ++ * be unmarking the old key and then marking the new key ++ */ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->ptrs[i] = new_s->ptrs[i]; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); ++ ++ /* ++ * gc recalculates this field from stripe ptr ++ * references: ++ */ ++ memset(m->block_sectors, 0, sizeof(m->block_sectors)); ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ ret = mark_stripe_bucket(trans, new, i, flags); ++ if (ret) ++ return ret; ++ } ++ ++ ret = update_replicas(c, new, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant), ++ journal_seq, gc); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, new); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_fs_usage __percpu *fs_usage; ++ u64 journal_seq = trans->journal_res.seq; ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; ++ ++ BUG_ON(!journal_seq); ++ BUG_ON(new.k->type != KEY_TYPE_inode_v2); ++ ++ v->bi_journal_seq = cpu_to_le64(journal_seq); ++ } ++ ++ if (flags & BTREE_TRIGGER_GC) { ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ ++ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ fs_usage->nr_inodes += bkey_is_inode(new.k); ++ fs_usage->nr_inodes -= bkey_is_inode(old.k); ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ } ++ return 0; ++} ++ ++int bch2_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; ++ struct bch_fs_usage __percpu *fs_usage; ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ s64 sectors = (s64) k.k->size; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ sectors *= replicas; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ ++ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ ++ return 0; ++} ++ ++static s64 __bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, ++ u64 *idx, unsigned flags, size_t r_idx) ++{ ++ struct bch_fs *c = trans->c; ++ struct reflink_gc *r; ++ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; ++ u64 next_idx = end; ++ s64 ret = 0; ++ struct printbuf buf = PRINTBUF; ++ ++ if (r_idx >= c->reflink_gc_nr) ++ goto not_found; ++ ++ r = genradix_ptr(&c->reflink_gc_table, r_idx); ++ next_idx = min(next_idx, r->offset - r->size); ++ if (*idx < next_idx) ++ goto not_found; ++ ++ BUG_ON((s64) r->refcount + add < 0); ++ ++ r->refcount += add; ++ *idx = r->offset; ++ return 0; ++not_found: ++ if (fsck_err(c, "pointer to missing indirect extent\n" ++ " %s\n" ++ " missing range %llu-%llu", ++ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), ++ *idx, next_idx)) { ++ struct bkey_i_error new; ++ ++ bkey_init(&new.k); ++ new.k.type = KEY_TYPE_error; ++ new.k.p = bkey_start_pos(p.k); ++ new.k.p.offset += *idx - start; ++ bch2_key_resize(&new.k, next_idx - *idx); ++ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); ++ } ++ ++ *idx = next_idx; ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ struct reflink_gc *ref; ++ size_t l, r, m; ++ u64 idx = le64_to_cpu(p.v->idx), start = idx; ++ u64 end = le64_to_cpu(p.v->idx) + p.k->size; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { ++ idx -= le32_to_cpu(p.v->front_pad); ++ end += le32_to_cpu(p.v->back_pad); ++ } ++ ++ l = 0; ++ r = c->reflink_gc_nr; ++ while (l < r) { ++ m = l + (r - l) / 2; ++ ++ ref = genradix_ptr(&c->reflink_gc_table, m); ++ if (ref->offset <= idx) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ while (idx < end && !ret) ++ ret = __bch2_mark_reflink_p(trans, p, start, end, ++ &idx, flags, l++); ++ ++ return ret; ++} ++ ++static noinline __cold ++void fs_usage_apply_warn(struct btree_trans *trans, ++ unsigned disk_res_sectors, ++ s64 should_not_have_added) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; ++ ++ bch_err(c, "disk usage increased %lli more than %u sectors reserved", ++ should_not_have_added, disk_res_sectors); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ pr_err("while inserting"); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ pr_err(" %s", buf.buf); ++ pr_err("overlapping with"); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, old); ++ pr_err(" %s", buf.buf); ++ } ++ ++ __WARN(); ++ printbuf_exit(&buf); ++} ++ ++int bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct replicas_delta_list *deltas) ++{ ++ struct bch_fs *c = trans->c; ++ static int warned_disk_usage = 0; ++ bool warn = false; ++ unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ struct replicas_delta *d = deltas->d, *d2; ++ struct replicas_delta *top = (void *) deltas->d + deltas->used; ++ struct bch_fs_usage *dst; ++ s64 added = 0, should_not_have_added; ++ unsigned i; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ dst = fs_usage_ptr(c, trans->journal_res.seq, false); ++ ++ for (d = deltas->d; d != top; d = replicas_delta_next(d)) { ++ switch (d->r.data_type) { ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ added += d->delta; ++ } ++ ++ if (__update_replicas(c, dst, &d->r, d->delta)) ++ goto need_mark; ++ } ++ ++ dst->nr_inodes += deltas->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ added += deltas->persistent_reserved[i]; ++ dst->reserved += deltas->persistent_reserved[i]; ++ dst->persistent_reserved[i] += deltas->persistent_reserved[i]; ++ } ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) disk_res_sectors; ++ if (unlikely(should_not_have_added > 0)) { ++ u64 old, new, v = atomic64_read(&c->sectors_available); ++ ++ do { ++ old = v; ++ new = max_t(s64, 0, old - should_not_have_added); ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, new)) != old); ++ ++ added -= should_not_have_added; ++ warn = true; ++ } ++ ++ if (added > 0) { ++ trans->disk_res->sectors -= added; ++ this_cpu_sub(*c->online_reserved, added); ++ } ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ ++ if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) ++ fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); ++ return 0; ++need_mark: ++ /* revert changes: */ ++ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) ++ 
BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return -1; ++} ++ ++/* trans_mark: */ ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ unsigned flags) ++{ ++ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ struct bpos bucket_pos; ++ struct bch_backpointer bp; ++ s64 sectors; ++ int ret; ++ ++ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); ++ sectors = bp.bucket_len; ++ if (!insert) ++ sectors = -sectors; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, ++ a->v.gen, &a->v.data_type, ++ &a->v.dirty_sectors, &a->v.cached_sectors); ++ if (ret) ++ goto err; ++ ++ if (!p.ptr.cached) { ++ ret = insert ++ ? bch2_bucket_backpointer_add(trans, a, bp, k) ++ : bch2_bucket_backpointer_del(trans, a, bp, k); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_stripe *s; ++ struct bch_replicas_padded r; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_stripe) { ++ bch2_trans_inconsistent(trans, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.ec.idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { ++ bch2_trans_inconsistent(trans, ++ "stripe pointer doesn't match stripe %llu", ++ (u64) p.ec.idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&s->k_i, k); ++ stripe_blockcount_set(&s->v, p.ec.block, ++ stripe_blockcount_get(&s->v, p.ec.block) + ++ sectors); ++ ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); ++ r.e.data_type = data_type; ++ update_replicas_list(trans, &r.e, sectors); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ++ ? BCH_DATA_btree ++ : BCH_DATA_user; ++ s64 sectors = bkey_is_btree_ptr(k.k) ++ ? 
btree_sectors(c) ++ : k.k->size; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = ptr_disk_sectors(sectors, p); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ disk_sectors = -disk_sectors; ++ ++ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors_list(trans, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ ret = bch2_trans_mark_stripe_ptr(trans, p, ++ disk_sectors, data_type); ++ if (ret) ++ return ret; ++ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas_list(trans, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c_stripe s, ++ unsigned idx, bool deleting) ++{ ++ struct bch_fs *c = trans->c; ++ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ++ ? BCH_DATA_parity : 0; ++ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; ++ int ret = 0; ++ ++ if (deleting) ++ sectors = -sectors; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, ++ a->v.gen, a->v.data_type, ++ a->v.dirty_sectors, a->v.cached_sectors); ++ if (ret) ++ goto err; ++ ++ if (!deleting) { ++ if (bch2_trans_inconsistent_on(a->v.stripe || ++ a->v.stripe_redundancy, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ a->v.stripe, s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = s.k->p.offset; ++ a->v.stripe_redundancy = s.v->nr_redundant; ++ } else { ++ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || ++ a->v.stripe_redundancy != s.v->nr_redundant, trans, ++ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ s.k->p.offset, a->v.stripe)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = 0; ++ a->v.stripe_redundancy = 0; ++ } ++ ++ a->v.dirty_sectors += sectors; ++ if (data_type) ++ a->v.data_type = !deleting ? 
data_type : 0; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_trans_mark_stripe(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ const struct bch_stripe *old_s = NULL; ++ struct bch_stripe *new_s = NULL; ++ struct bch_replicas_padded r; ++ unsigned i, nr_blocks; ++ int ret = 0; ++ ++ if (old.k->type == KEY_TYPE_stripe) ++ old_s = bkey_s_c_to_stripe(old).v; ++ if (new->k.type == KEY_TYPE_stripe) ++ new_s = &bkey_i_to_stripe(new)->v; ++ ++ /* ++ * If the pointers aren't changing, we don't need to do anything: ++ */ ++ if (new_s && old_s && ++ new_s->nr_blocks == old_s->nr_blocks && ++ new_s->nr_redundant == old_s->nr_redundant && ++ !memcmp(old_s->ptrs, new_s->ptrs, ++ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) ++ return 0; ++ ++ BUG_ON(new_s && old_s && ++ (new_s->nr_blocks != old_s->nr_blocks || ++ new_s->nr_redundant != old_s->nr_redundant)); ++ ++ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; ++ ++ if (new_s) { ++ s64 sectors = le16_to_cpu(new_s->sectors); ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); ++ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); ++ } ++ ++ if (old_s) { ++ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); ++ ++ bch2_bkey_to_replicas(&r.e, old); ++ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); ++ } ++ ++ for (i = 0; i < nr_blocks; i++) { ++ if (new_s && old_s && ++ !memcmp(&new_s->ptrs[i], ++ &old_s->ptrs[i], ++ sizeof(new_s->ptrs[i]))) ++ continue; ++ ++ if (new_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_i_to_s_c_stripe(new), i, false); ++ if (ret) ++ break; ++ } ++ ++ if (old_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_s_c_to_stripe(old), i, true); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_trans_mark_inode(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); ++ ++ if (nr) { ++ struct replicas_delta_list *d = ++ replicas_deltas_realloc(trans, 0); ++ d->nr_inodes += nr; ++ } ++ ++ return 0; ++} ++ ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ s64 sectors = (s64) k.k->size; ++ struct replicas_delta_list *d; ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ sectors *= replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++} ++ ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 *idx, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i *n; ++ __le64 *refcount; ++ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(n, k); ++ ++ refcount = bkey_refcount(n); ++ if (!refcount) { ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, ++ "nonexistent indirect extent at %llu while marking\n %s", ++ *idx, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, ++ "indirect extent refcount underflow at %llu while marking\n %s", ++ *idx, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; ++ u64 pad; ++ ++ pad = max_t(s64, le32_to_cpu(v->front_pad), ++ le64_to_cpu(v->idx) - bkey_start_offset(k.k)); ++ BUG_ON(pad > U32_MAX); ++ v->front_pad = cpu_to_le32(pad); ++ ++ pad = max_t(s64, le32_to_cpu(v->back_pad), ++ k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); ++ BUG_ON(pad > U32_MAX); ++ v->back_pad = cpu_to_le32(pad); ++ } ++ ++ le64_add_cpu(refcount, add); ++ ++ bch2_btree_iter_set_pos_to_extent_start(&iter); ++ ret = bch2_trans_update(trans, &iter, n, 0); ++ if (ret) ++ goto err; ++ ++ *idx = k.k->p.offset; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? 
old ++ : bkey_i_to_s_c(new); ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx, end_idx; ++ int ret = 0; ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; ++ ++ v->front_pad = v->back_pad = 0; ++ } ++ ++ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); ++ end_idx = le64_to_cpu(p.v->idx) + p.k->size + ++ le32_to_cpu(p.v->back_pad); ++ ++ while (idx < end_idx && !ret) ++ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); ++ ++ return ret; ++} ++ ++static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ++ struct bch_dev *ca, size_t b, ++ enum bch_data_type type, ++ unsigned sectors) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ int ret = 0; ++ ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return 0; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ if (a->v.data_type && a->v.data_type != type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ bch2_data_types[type], ++ bch2_data_types[type]); ++ ret = -EIO; ++ goto out; ++ } ++ ++ a->v.data_type = type; ++ a->v.dirty_sectors = sectors; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ++ struct bch_dev *ca, size_t b, ++ enum bch_data_type type, ++ unsigned sectors) ++{ ++ return commit_do(trans, NULL, NULL, 0, ++ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); ++} ++ ++static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, ++ struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ u64 *bucket, unsigned *bucket_sectors) ++{ ++ do { ++ u64 b = sector_to_bucket(ca, start); ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ if (b != *bucket && *bucket_sectors) { ++ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, ++ type, *bucket_sectors); ++ if (ret) ++ return ret; ++ ++ *bucket_sectors = 0; ++ } ++ ++ *bucket = b; ++ *bucket_sectors += sectors; ++ start += sectors; ++ } while (start < end); ++ ++ return 0; ++} ++ ++static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, ++ struct bch_dev *ca) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 bucket = 0; ++ unsigned i, bucket_sectors = 0; ++ int ret; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) { ++ ret = bch2_trans_mark_metadata_sectors(trans, ca, ++ 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, &bucket, &bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, &bucket, &bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ if (bucket_sectors) { ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, ++ bucket, BCH_DATA_sb, bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, ++ ca->journal.buckets[i], ++ BCH_DATA_journal, ca->mi.bucket_size); ++ if 
(ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); ++} ++ ++/* Disk reservations: */ ++ ++#define SECTORS_CACHE 1024 ++ ++int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++ u64 sectors, int flags) ++{ ++ struct bch_fs_pcpu *pcpu; ++ u64 old, v, get; ++ s64 sectors_available; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ pcpu = this_cpu_ptr(c->pcpu); ++ ++ if (sectors <= pcpu->sectors_available) ++ goto out; ++ ++ v = atomic64_read(&c->sectors_available); ++ do { ++ old = v; ++ get = min((u64) sectors + SECTORS_CACHE, old); ++ ++ if (get < sectors) { ++ preempt_enable(); ++ goto recalculate; ++ } ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, old - get)) != old); ++ ++ pcpu->sectors_available += get; ++ ++out: ++ pcpu->sectors_available -= sectors; ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return 0; ++ ++recalculate: ++ mutex_lock(&c->sectors_available_lock); ++ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ ret = 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ ret = -ENOSPC; ++ } ++ ++ mutex_unlock(&c->sectors_available_lock); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++/* Startup/shutdown: */ ++ ++static void bucket_gens_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_gens *buckets = ++ container_of(rcu, struct bucket_gens, rcu); ++ ++ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; ++ unsigned long *buckets_nouse = NULL; ++ bool resize = ca->bucket_gens != NULL; ++ int ret = -ENOMEM; ++ ++ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, ++ GFP_KERNEL|__GFP_ZERO)) || ++ (c->opts.buckets_nouse && ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ sizeof(unsigned long), ++ GFP_KERNEL|__GFP_ZERO)))) ++ goto err; ++ ++ bucket_gens->first_bucket = ca->mi.first_bucket; ++ bucket_gens->nbuckets = nbuckets; ++ ++ bch2_copygc_stop(c); ++ ++ if (resize) { ++ down_write(&c->gc_lock); ++ down_write(&ca->bucket_lock); ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); ++ ++ if (resize) { ++ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); ++ ++ memcpy(bucket_gens->b, ++ old_bucket_gens->b, ++ n); ++ if (buckets_nouse) ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ } ++ ++ rcu_assign_pointer(ca->bucket_gens, bucket_gens); ++ bucket_gens = old_bucket_gens; ++ ++ swap(ca->buckets_nouse, buckets_nouse); ++ ++ nbuckets = ca->mi.nbuckets; ++ ++ if (resize) { ++ percpu_up_write(&c->mark_lock); ++ up_write(&ca->bucket_lock); ++ up_write(&c->gc_lock); ++ } ++ ++ ret = 0; ++err: ++ kvpfree(buckets_nouse, ++ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (bucket_gens) ++ call_rcu(&bucket_gens->rcu, 
bucket_gens_free_rcu); ++ ++ return ret; ++} ++ ++void bch2_dev_buckets_free(struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ kvpfree(ca->buckets_nouse, ++ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), ++ sizeof(struct bucket_gens) + ca->mi.nbuckets); ++ ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) ++ free_percpu(ca->usage[i]); ++ kfree(ca->usage_base); ++} ++ ++int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); ++ if (!ca->usage_base) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { ++ ca->usage[i] = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[i]) ++ return -ENOMEM; ++ } ++ ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++} +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +new file mode 100644 +index 000000000000..6881502d95f1 +--- /dev/null ++++ b/fs/bcachefs/buckets.h +@@ -0,0 +1,300 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ */ ++ ++#ifndef _BUCKETS_H ++#define _BUCKETS_H ++ ++#include "buckets_types.h" ++#include "extents.h" ++#include "super.h" ++ ++#define for_each_bucket(_b, _buckets) \ ++ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ ++ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) ++ ++static inline void bucket_unlock(struct bucket *b) ++{ ++ smp_store_release(&b->lock, 0); ++} ++ ++static inline void bucket_lock(struct bucket *b) ++{ ++ while (xchg(&b->lock, 1)) ++ cpu_relax(); ++} ++ ++static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->buckets_gc, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) ++{ ++ struct bucket_array *buckets = gc_bucket_array(ca); ++ ++ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); ++ return buckets->b + b; ++} ++ ++static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->bucket_gens, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket_gens *gens = bucket_gens(ca); ++ ++ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); ++ return gens->b + b; ++} ++ ++static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return sector_to_bucket(ca, ptr->offset); ++} ++ ++static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr, ++ u32 *bucket_offset) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); ++} ++ ++static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline enum bch_data_type ptr_data_type(const struct bkey *k, ++ 
const struct bch_extent_ptr *ptr) ++{ ++ if (bkey_is_btree_ptr(k)) ++ return BCH_DATA_btree; ++ ++ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; ++} ++ ++static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) ++{ ++ EBUG_ON(sectors < 0); ++ ++ return crc_is_compressed(p.crc) ++ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, ++ p.crc.uncompressed_size) ++ : sectors; ++} ++ ++static inline int gen_cmp(u8 a, u8 b) ++{ ++ return (s8) (a - b); ++} ++ ++static inline int gen_after(u8 a, u8 b) ++{ ++ int r = gen_cmp(a, b); ++ ++ return r > 0 ? r : 0; ++} ++ ++/** ++ * ptr_stale() - check if a pointer points into a bucket that has been ++ * invalidated. ++ */ ++static inline u8 ptr_stale(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ u8 ret; ++ ++ rcu_read_lock(); ++ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/* Device usage: */ ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); ++void bch2_dev_usage_init(struct bch_dev *); ++ ++static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) ++{ ++ s64 reserved = 0; ++ ++ switch (reserve) { ++ case RESERVE_none: ++ reserved += ca->mi.nbuckets >> 6; ++ fallthrough; ++ case RESERVE_movinggc: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree_movinggc: ++ break; ++ } ++ ++ return reserved; ++} ++ ++static inline u64 dev_buckets_free(struct bch_dev *ca, ++ struct bch_dev_usage usage, ++ enum alloc_reserve reserve) ++{ ++ return max_t(s64, 0, ++ usage.d[BCH_DATA_free].buckets - ++ ca->nr_open_buckets - ++ bch2_dev_buckets_reserved(ca, reserve)); ++} ++ ++static inline u64 __dev_buckets_available(struct bch_dev *ca, ++ struct bch_dev_usage usage, ++ enum alloc_reserve reserve) ++{ ++ return max_t(s64, 0, ++ usage.d[BCH_DATA_free].buckets ++ + usage.d[BCH_DATA_cached].buckets ++ + usage.d[BCH_DATA_need_gc_gens].buckets ++ + usage.d[BCH_DATA_need_discard].buckets ++ - ca->nr_open_buckets ++ - bch2_dev_buckets_reserved(ca, reserve)); ++} ++ ++static inline u64 dev_buckets_available(struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); ++} ++ ++/* Filesystem usage: */ ++ ++static inline unsigned fs_usage_u64s(struct bch_fs *c) ++{ ++ return sizeof(struct bch_fs_usage) / sizeof(u64) + ++ READ_ONCE(c->replicas.nr); ++} ++ ++static inline unsigned dev_usage_u64s(void) ++{ ++ return sizeof(struct bch_dev_usage) / sizeof(u64); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); ++ ++struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); ++ ++void bch2_fs_usage_to_text(struct printbuf *, ++ struct bch_fs *, struct bch_fs_usage_online *); ++ ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *); ++ ++/* key/bucket marking: */ ++ ++void bch2_fs_usage_initialize(struct bch_fs *); ++ ++int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned, ++ struct gc_pos, unsigned); ++ ++int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_stripe(struct btree_trans *, struct 
bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++ ++int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); ++ ++int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned); ++int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); ++ ++/* disk reservations: */ ++ ++static inline void bch2_disk_reservation_put(struct bch_fs *c, ++ struct disk_reservation *res) ++{ ++ this_cpu_sub(*c->online_reserved, res->sectors); ++ res->sectors = 0; ++} ++ ++#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) ++ ++int bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ u64, int); ++ ++static inline struct disk_reservation ++bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) ++{ ++ return (struct disk_reservation) { ++ .sectors = 0, ++#if 0 ++ /* not used yet: */ ++ .gen = c->capacity_gen, ++#endif ++ .nr_replicas = nr_replicas, ++ }; ++} ++ ++static inline int bch2_disk_reservation_get(struct bch_fs *c, ++ struct disk_reservation *res, ++ u64 sectors, unsigned nr_replicas, ++ int flags) ++{ ++ *res = bch2_disk_reservation_init(c, nr_replicas); ++ ++ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); ++} ++ ++#define RESERVE_FACTOR 6 ++ ++static inline u64 avail_factor(u64 r) ++{ ++ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); ++void bch2_dev_buckets_free(struct bch_dev *); ++int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); ++ ++#endif /* _BUCKETS_H */ +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +new file mode 100644 +index 000000000000..1dbba7d906dd +--- /dev/null ++++ b/fs/bcachefs/buckets_types.h +@@ -0,0 +1,103 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_TYPES_H ++#define _BUCKETS_TYPES_H ++ ++#include "bcachefs_format.h" ++#include "util.h" ++ ++#define BUCKET_JOURNAL_SEQ_BITS 16 ++ ++struct bucket { ++ u8 lock; ++ u8 gen_valid:1; ++ u8 data_type:7; ++ u8 gen; ++ u8 stripe_redundancy; ++ u32 stripe; ++ u32 dirty_sectors; ++ u32 cached_sectors; ++}; ++ ++struct bucket_array { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ struct bucket b[]; ++}; ++ ++struct bucket_gens { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ u8 b[]; ++}; ++ ++struct bch_dev_usage { ++ u64 buckets_ec; ++ ++ struct { ++ u64 buckets; ++ u64 sectors; /* _compressed_ sectors: */ ++ /* ++ * XXX ++ * Why do we have this? 
Isn't it just buckets * bucket_size - ++ * sectors? ++ */ ++ u64 fragmented; ++ } d[BCH_DATA_NR]; ++}; ++ ++struct bch_fs_usage { ++ /* all fields are in units of 512 byte sectors: */ ++ u64 hidden; ++ u64 btree; ++ u64 data; ++ u64 cached; ++ u64 reserved; ++ u64 nr_inodes; ++ ++ /* XXX: add stats for compression ratio */ ++#if 0 ++ u64 uncompressed; ++ u64 compressed; ++#endif ++ ++ /* broken out: */ ++ ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ u64 replicas[]; ++}; ++ ++struct bch_fs_usage_online { ++ u64 online_reserved; ++ struct bch_fs_usage u; ++}; ++ ++struct bch_fs_usage_short { ++ u64 capacity; ++ u64 used; ++ u64 free; ++ u64 nr_inodes; ++}; ++ ++/* ++ * A reservation for space on disk: ++ */ ++struct disk_reservation { ++ u64 sectors; ++ u32 gen; ++ unsigned nr_replicas; ++}; ++ ++struct copygc_heap_entry { ++ u8 dev; ++ u8 gen; ++ u8 replicas; ++ u32 fragmentation; ++ u32 sectors; ++ u64 bucket; ++}; ++ ++typedef HEAP(struct copygc_heap_entry) copygc_heap; ++ ++#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c +new file mode 100644 +index 000000000000..2e5b955080de +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.c +@@ -0,0 +1,167 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets_waiting_for_journal.h" ++#include ++ ++static inline struct bucket_hashed * ++bucket_hash(struct buckets_waiting_for_journal_table *t, ++ unsigned hash_seed_idx, u64 dev_bucket) ++{ ++ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); ++ ++ BUG_ON(!is_power_of_2(t->size)); ++ ++ return t->d + (h & (t->size - 1)); ++} ++ ++static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) ++{ ++ unsigned i; ++ ++ t->size = size; ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) ++ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); ++ memset(t->d, 0, sizeof(t->d[0]) * size); ++} ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket) ++{ ++ struct buckets_waiting_for_journal_table *t; ++ u64 dev_bucket = (u64) dev << 56 | bucket; ++ bool ret = false; ++ unsigned i; ++ ++ mutex_lock(&b->lock); ++ t = b->t; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); ++ ++ if (h->dev_bucket == dev_bucket) { ++ ret = h->journal_seq > flushed_seq; ++ break; ++ } ++ } ++ ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, ++ struct bucket_hashed *new, ++ u64 flushed_seq) ++{ ++ struct bucket_hashed *last_evicted = NULL; ++ unsigned tries, i; ++ ++ for (tries = 0; tries < 10; tries++) { ++ struct bucket_hashed *old, *victim = NULL; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ old = bucket_hash(t, i, new->dev_bucket); ++ ++ if (old->dev_bucket == new->dev_bucket || ++ old->journal_seq <= flushed_seq) { ++ *old = *new; ++ return true; ++ } ++ ++ if (last_evicted != old) ++ victim = old; ++ } ++ ++ /* hashed to same slot 3 times: */ ++ if (!victim) ++ break; ++ ++ /* Failed to find an empty slot: */ ++ swap(*new, *victim); ++ last_evicted = victim; ++ } ++ ++ return false; ++} ++ ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket, ++ u64 journal_seq) ++{ ++ struct buckets_waiting_for_journal_table *t, *n; ++ struct bucket_hashed 
tmp, new = { ++ .dev_bucket = (u64) dev << 56 | bucket, ++ .journal_seq = journal_seq, ++ }; ++ size_t i, new_size, nr_elements = 1, nr_rehashes = 0; ++ int ret = 0; ++ ++ mutex_lock(&b->lock); ++ ++ if (likely(bucket_table_insert(b->t, &new, flushed_seq))) ++ goto out; ++ ++ t = b->t; ++ for (i = 0; i < t->size; i++) ++ nr_elements += t->d[i].journal_seq > flushed_seq; ++ ++ new_size = nr_elements < t->size / 3 ? t->size : t->size * 2; ++ ++ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++retry_rehash: ++ nr_rehashes++; ++ bucket_table_init(n, new_size); ++ ++ tmp = new; ++ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); ++ ++ for (i = 0; i < t->size; i++) { ++ if (t->d[i].journal_seq <= flushed_seq) ++ continue; ++ ++ tmp = t->d[i]; ++ if (!bucket_table_insert(n, &tmp, flushed_seq)) ++ goto retry_rehash; ++ } ++ ++ b->t = n; ++ kvfree(t); ++ ++ pr_debug("took %zu rehashes, table at %zu/%zu elements", ++ nr_rehashes, nr_elements, b->t->size); ++out: ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ kvfree(b->t); ++} ++ ++#define INITIAL_TABLE_SIZE 8 ++ ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ mutex_init(&b->lock); ++ ++ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); ++ if (!b->t) ++ return -ENOMEM; ++ ++ bucket_table_init(b->t, INITIAL_TABLE_SIZE); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h +new file mode 100644 +index 000000000000..d2ae19cbe18c +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_H ++ ++#include "buckets_waiting_for_journal_types.h" ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64); ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64, u64); ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h +new file mode 100644 +index 000000000000..fea7f944d0ed +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal_types.h +@@ -0,0 +1,23 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++ ++#include ++ ++struct bucket_hashed { ++ u64 dev_bucket; ++ u64 journal_seq; ++}; ++ ++struct buckets_waiting_for_journal_table { ++ size_t size; ++ siphash_key_t hash_seeds[3]; ++ struct bucket_hashed d[]; ++}; ++ ++struct buckets_waiting_for_journal { ++ struct mutex lock; ++ struct buckets_waiting_for_journal_table *t; ++}; ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +new file mode 100644 +index 000000000000..dbb7e5e0b35b +--- /dev/null ++++ b/fs/bcachefs/chardev.c +@@ -0,0 +1,760 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_CHARDEV ++ ++#include "bcachefs.h" ++#include 
"bcachefs_ioctl.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "journal.h" ++#include "move.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* returns with ref on ca->ref */ ++static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, ++ unsigned flags) ++{ ++ struct bch_dev *ca; ++ ++ if (flags & BCH_BY_INDEX) { ++ if (dev >= c->sb.nr_devices) ++ return ERR_PTR(-EINVAL); ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ return ERR_PTR(-EINVAL); ++ } else { ++ char *path; ++ ++ path = strndup_user((const char __user *) ++ (unsigned long) dev, PATH_MAX); ++ if (IS_ERR(path)) ++ return ERR_CAST(path); ++ ++ ca = bch2_dev_lookup(c, path); ++ kfree(path); ++ } ++ ++ return ca; ++} ++ ++#if 0 ++static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) ++{ ++ struct bch_ioctl_assemble arg; ++ struct bch_fs *c; ++ u64 *user_devs = NULL; ++ char **devs = NULL; ++ unsigned i; ++ int ret = -EFAULT; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); ++ if (!user_devs) ++ return -ENOMEM; ++ ++ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); ++ ++ if (copy_from_user(user_devs, user_arg->devs, ++ sizeof(u64) * arg.nr_devs)) ++ goto err; ++ ++ for (i = 0; i < arg.nr_devs; i++) { ++ devs[i] = strndup_user((const char __user *)(unsigned long) ++ user_devs[i], ++ PATH_MAX); ++ if (!devs[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (!ret) ++ closure_put(&c->cl); ++err: ++ if (devs) ++ for (i = 0; i < arg.nr_devs; i++) ++ kfree(devs[i]); ++ kfree(devs); ++ return ret; ++} ++ ++static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) ++{ ++ struct bch_ioctl_incremental arg; ++ const char *err; ++ char *path; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ err = bch2_fs_open_incremental(path); ++ kfree(path); ++ ++ if (err) { ++ pr_err("Could not register bcachefs devices: %s", err); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ ++static long bch2_global_ioctl(unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_ASSEMBLE: ++ return bch2_ioctl_assemble(arg); ++ case BCH_IOCTL_INCREMENTAL: ++ return bch2_ioctl_incremental(arg); ++#endif ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static long bch2_ioctl_query_uuid(struct bch_fs *c, ++ struct bch_ioctl_query_uuid __user *user_arg) ++{ ++ return copy_to_user(&user_arg->uuid, ++ &c->sb.user_uuid, ++ sizeof(c->sb.user_uuid)); ++} ++ ++#if 0 ++static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) ++{ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ return bch2_fs_start(c); ++} ++ ++static long bch2_ioctl_stop(struct bch_fs *c) ++{ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ bch2_fs_stop(c); ++ return 0; ++} ++#endif ++ ++static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk 
arg) ++{ ++ char *path; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_add(c, path); ++ kfree(path); ++ ++ return ret; ++} ++ ++static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ return bch2_dev_remove(c, ca, arg.flags); ++} ++ ++static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_online(c, path); ++ kfree(path); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_offline(c, ca, arg.flags); ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_set_state(struct bch_fs *c, ++ struct bch_ioctl_disk_set_state arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad[0] || arg.pad[1] || arg.pad[2] || ++ arg.new_state >= BCH_MEMBER_STATE_NR) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++struct bch_data_ctx { ++ struct bch_fs *c; ++ struct bch_ioctl_data arg; ++ struct bch_move_stats stats; ++ ++ int ret; ++ ++ struct task_struct *thread; ++}; ++ ++static int bch2_data_thread(void *arg) ++{ ++ struct bch_data_ctx *ctx = arg; ++ ++ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ++ ctx->stats.data_type = U8_MAX; ++ return 0; ++} ++ ++static int bch2_data_job_release(struct inode *inode, struct file *file) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ ++ kthread_stop(ctx->thread); ++ put_task_struct(ctx->thread); ++ kfree(ctx); ++ return 0; ++} ++ ++static ssize_t bch2_data_job_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ struct bch_fs *c = ctx->c; ++ struct bch_ioctl_data_event e = { ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.btree_id, ++ .p.pos = ctx->stats.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ }; ++ ++ if (len < sizeof(e)) ++ return -EINVAL; ++ ++ return copy_to_user(buf, &e, sizeof(e)) ?: 
sizeof(e); ++} ++ ++static const struct file_operations bcachefs_data_ops = { ++ .release = bch2_data_job_release, ++ .read = bch2_data_job_read, ++ .llseek = no_llseek, ++}; ++ ++static long bch2_ioctl_data(struct bch_fs *c, ++ struct bch_ioctl_data arg) ++{ ++ struct bch_data_ctx *ctx = NULL; ++ struct file *file = NULL; ++ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; ++ int ret, fd = -1; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) ++ return -EINVAL; ++ ++ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ ctx->c = c; ++ ctx->arg = arg; ++ ++ ctx->thread = kthread_create(bch2_data_thread, ctx, ++ "bch-data/%s", c->name); ++ if (IS_ERR(ctx->thread)) { ++ ret = PTR_ERR(ctx->thread); ++ goto err; ++ } ++ ++ ret = get_unused_fd_flags(flags); ++ if (ret < 0) ++ goto err; ++ fd = ret; ++ ++ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); ++ if (IS_ERR(file)) { ++ ret = PTR_ERR(file); ++ goto err; ++ } ++ ++ fd_install(fd, file); ++ ++ get_task_struct(ctx->thread); ++ wake_up_process(ctx->thread); ++ ++ return fd; ++err: ++ if (fd >= 0) ++ put_unused_fd(fd); ++ if (!IS_ERR_OR_NULL(ctx->thread)) ++ kthread_stop(ctx->thread); ++ kfree(ctx); ++ return ret; ++} ++ ++static long bch2_ioctl_fs_usage(struct bch_fs *c, ++ struct bch_ioctl_fs_usage __user *user_arg) ++{ ++ struct bch_ioctl_fs_usage *arg = NULL; ++ struct bch_replicas_usage *dst_e, *dst_end; ++ struct bch_fs_usage_online *src; ++ u32 replica_entries_bytes; ++ unsigned i; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) ++ return -EFAULT; ++ ++ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); ++ if (!arg) ++ return -ENOMEM; ++ ++ src = bch2_fs_usage_read(c); ++ if (!src) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ arg->capacity = c->capacity; ++ arg->used = bch2_fs_sectors_used(c, src); ++ arg->online_reserved = src->online_reserved; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ arg->persistent_reserved[i] = src->u.persistent_reserved[i]; ++ ++ dst_e = arg->replicas; ++ dst_end = (void *) arg->replicas + replica_entries_bytes; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *src_e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ /* check that we have enough space for one replicas entry */ ++ if (dst_e + 1 > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ dst_e->sectors = src->u.replicas[i]; ++ dst_e->r = *src_e; ++ ++ /* recheck after setting nr_devs: */ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); ++ ++ dst_e = replicas_usage_next(dst_e); ++ } ++ ++ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; ++ ++ percpu_up_read(&c->mark_lock); ++ kfree(src); ++ ++ if (!ret) ++ ret = copy_to_user(user_arg, arg, ++ sizeof(*arg) + arg->replica_entries_bytes); ++err: ++ kfree(arg); ++ return ret; ++} ++ ++static long bch2_ioctl_dev_usage(struct bch_fs *c, ++ struct bch_ioctl_dev_usage __user *user_arg) ++{ ++ struct bch_ioctl_dev_usage arg; ++ struct bch_dev_usage src; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad[0] || ++ arg.pad[1] || ++ arg.pad[2]) ++ return -EINVAL; ++ ++ ca = 
bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ src = bch2_dev_usage_read(ca); ++ ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.buckets_ec = src.buckets_ec; ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ arg.d[i].buckets = src.d[i].buckets; ++ arg.d[i].sectors = src.d[i].sectors; ++ arg.d[i].fragmented = src.d[i].fragmented; ++ } ++ ++ percpu_ref_put(&ca->ref); ++ ++ return copy_to_user(user_arg, &arg, sizeof(arg)); ++} ++ ++static long bch2_ioctl_read_super(struct bch_fs *c, ++ struct bch_ioctl_read_super arg) ++{ ++ struct bch_dev *ca = NULL; ++ struct bch_sb *sb; ++ int ret = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || ++ arg.pad) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (arg.flags & BCH_READ_DEV) { ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ ++ if (IS_ERR(ca)) { ++ ret = PTR_ERR(ca); ++ goto err; ++ } ++ ++ sb = ca->disk_sb.sb; ++ } else { ++ sb = c->disk_sb.sb; ++ } ++ ++ if (vstruct_bytes(sb) > arg.size) { ++ ret = -ERANGE; ++ goto err; ++ } ++ ++ ret = copy_to_user((void __user *)(unsigned long)arg.sb, ++ sb, vstruct_bytes(sb)); ++err: ++ if (!IS_ERR_OR_NULL(ca)) ++ percpu_ref_put(&ca->ref); ++ mutex_unlock(&c->sb_lock); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_get_idx(struct bch_fs *c, ++ struct bch_ioctl_disk_get_idx arg) ++{ ++ dev_t dev = huge_decode_dev(arg.dev); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (!dev) ++ return -EINVAL; ++ ++ for_each_online_member(ca, c, i) ++ if (ca->dev == dev) { ++ percpu_ref_put(&ca->io_ref); ++ return i; ++ } ++ ++ return -ENOENT; ++} ++ ++static long bch2_ioctl_disk_resize(struct bch_fs *c, ++ struct bch_ioctl_disk_resize arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_resize(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ++ struct bch_ioctl_disk_resize_journal arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++#define BCH_IOCTL(_name, _argtype) \ ++do { \ ++ _argtype i; \ ++ \ ++ if (copy_from_user(&i, arg, sizeof(i))) \ ++ return -EFAULT; \ ++ return bch2_ioctl_##_name(c, i); \ ++} while (0) ++ ++long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++ case BCH_IOCTL_QUERY_UUID: ++ return bch2_ioctl_query_uuid(c, arg); ++ case BCH_IOCTL_FS_USAGE: ++ return bch2_ioctl_fs_usage(c, arg); ++ case BCH_IOCTL_DEV_USAGE: ++ return bch2_ioctl_dev_usage(c, arg); ++#if 0 ++ case BCH_IOCTL_START: ++ BCH_IOCTL(start, struct bch_ioctl_start); ++ case BCH_IOCTL_STOP: ++ return bch2_ioctl_stop(c); ++#endif ++ case BCH_IOCTL_READ_SUPER: ++ BCH_IOCTL(read_super, struct bch_ioctl_read_super); ++ case BCH_IOCTL_DISK_GET_IDX: ++ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); ++ 
} ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ switch (cmd) { ++ case BCH_IOCTL_DISK_ADD: ++ BCH_IOCTL(disk_add, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_REMOVE: ++ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_ONLINE: ++ BCH_IOCTL(disk_online, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_OFFLINE: ++ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_SET_STATE: ++ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); ++ case BCH_IOCTL_DATA: ++ BCH_IOCTL(data, struct bch_ioctl_data); ++ case BCH_IOCTL_DISK_RESIZE: ++ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ case BCH_IOCTL_DISK_RESIZE_JOURNAL: ++ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); ++ ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static DEFINE_IDR(bch_chardev_minor); ++ ++static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) ++{ ++ unsigned minor = iminor(file_inode(filp)); ++ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; ++ void __user *arg = (void __user *) v; ++ ++ return c ++ ? bch2_fs_ioctl(c, cmd, arg) ++ : bch2_global_ioctl(cmd, arg); ++} ++ ++static const struct file_operations bch_chardev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = bch2_chardev_ioctl, ++ .open = nonseekable_open, ++}; ++ ++static int bch_chardev_major; ++static struct class *bch_chardev_class; ++static struct device *bch_chardev; ++ ++void bch2_fs_chardev_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->chardev)) ++ device_unregister(c->chardev); ++ if (c->minor >= 0) ++ idr_remove(&bch_chardev_minor, c->minor); ++} ++ ++int bch2_fs_chardev_init(struct bch_fs *c) ++{ ++ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); ++ if (c->minor < 0) ++ return c->minor; ++ ++ c->chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, c->minor), c, ++ "bcachefs%u-ctl", c->minor); ++ if (IS_ERR(c->chardev)) ++ return PTR_ERR(c->chardev); ++ ++ return 0; ++} ++ ++void bch2_chardev_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ device_destroy(bch_chardev_class, ++ MKDEV(bch_chardev_major, U8_MAX)); ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ class_destroy(bch_chardev_class); ++ if (bch_chardev_major > 0) ++ unregister_chrdev(bch_chardev_major, "bcachefs"); ++} ++ ++int __init bch2_chardev_init(void) ++{ ++ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); ++ if (bch_chardev_major < 0) ++ return bch_chardev_major; ++ ++ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); ++ if (IS_ERR(bch_chardev_class)) ++ return PTR_ERR(bch_chardev_class); ++ ++ bch_chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, U8_MAX), ++ NULL, "bcachefs-ctl"); ++ if (IS_ERR(bch_chardev)) ++ return PTR_ERR(bch_chardev); ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h +new file mode 100644 +index 000000000000..3a4890d39ff9 +--- /dev/null ++++ b/fs/bcachefs/chardev.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHARDEV_H ++#define _BCACHEFS_CHARDEV_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); ++ ++void bch2_fs_chardev_exit(struct bch_fs *); ++int bch2_fs_chardev_init(struct bch_fs *); ++ ++void bch2_chardev_exit(void); ++int __init bch2_chardev_init(void); ++ ++#else ++ ++static inline long bch2_fs_ioctl(struct bch_fs *c, ++ unsigned cmd, void 
__user * arg) ++{ ++ return -ENOSYS; ++} ++ ++static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} ++static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } ++ ++static inline void bch2_chardev_exit(void) {} ++static inline int __init bch2_chardev_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_CHARDEV_H */ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +new file mode 100644 +index 000000000000..b5850a761b91 +--- /dev/null ++++ b/fs/bcachefs/checksum.c +@@ -0,0 +1,712 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "errcode.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * bch2_checksum state is an abstraction of the checksum state calculated over different pages. ++ * it features page merging without having the checksum algorithm lose its state. ++ * for native checksum aglorithms (like crc), a default seed value will do. ++ * for hash-like algorithms, a state needs to be stored ++ */ ++ ++struct bch2_checksum_state { ++ union { ++ u64 seed; ++ struct xxh64_state h64state; ++ }; ++ unsigned int type; ++}; ++ ++static void bch2_checksum_init(struct bch2_checksum_state *state) ++{ ++ switch (state->type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: ++ state->seed = 0; ++ break; ++ case BCH_CSUM_crc32c_nonzero: ++ state->seed = U32_MAX; ++ break; ++ case BCH_CSUM_crc64_nonzero: ++ state->seed = U64_MAX; ++ break; ++ case BCH_CSUM_xxhash: ++ xxh64_reset(&state->h64state, 0); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_final(const struct bch2_checksum_state *state) ++{ ++ switch (state->type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: ++ return state->seed; ++ case BCH_CSUM_crc32c_nonzero: ++ return state->seed ^ U32_MAX; ++ case BCH_CSUM_crc64_nonzero: ++ return state->seed ^ U64_MAX; ++ case BCH_CSUM_xxhash: ++ return xxh64_digest(&state->h64state); ++ default: ++ BUG(); ++ } ++} ++ ++static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) ++{ ++ switch (state->type) { ++ case BCH_CSUM_none: ++ return; ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc32c: ++ state->seed = crc32c(state->seed, data, len); ++ break; ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc64: ++ state->seed = crc64_be(state->seed, data, len); ++ break; ++ case BCH_CSUM_xxhash: ++ xxh64_update(&state->h64state, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) ++{ ++ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); ++ int ret; ++ ++ skcipher_request_set_sync_tfm(req, tfm); ++ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ++ ++ ret = crypto_skcipher_encrypt(req); ++ if (ret) ++ pr_err("got error %i from crypto_skcipher_encrypt()", ret); ++ ++ return ret; ++} ++ ++static inline int do_encrypt(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ void *buf, size_t len) ++{ ++ if (!is_vmalloc_addr(buf)) { ++ struct scatterlist sg; ++ ++ sg_init_table(&sg, 1); ++ sg_set_page(&sg, ++ is_vmalloc_addr(buf) ++ ? 
vmalloc_to_page(buf) ++ : virt_to_page(buf), ++ len, offset_in_page(buf)); ++ return do_encrypt_sg(tfm, nonce, &sg, len); ++ } else { ++ unsigned pages = buf_pages(buf, len); ++ struct scatterlist *sg; ++ size_t orig_len = len; ++ int ret, i; ++ ++ sg = kmalloc_array(sizeof(*sg), pages, GFP_KERNEL); ++ if (!sg) ++ return -ENOMEM; ++ ++ sg_init_table(sg, pages); ++ ++ for (i = 0; i < pages; i++) { ++ unsigned offset = offset_in_page(buf); ++ unsigned pg_len = min(len, PAGE_SIZE - offset); ++ ++ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); ++ buf += pg_len; ++ len -= pg_len; ++ } ++ ++ ret = do_encrypt_sg(tfm, nonce, sg, orig_len); ++ kfree(sg); ++ return ret; ++ } ++} ++ ++int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct crypto_sync_skcipher *chacha20 = ++ crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ int ret; ++ ++ if (!chacha20) { ++ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); ++ return PTR_ERR(chacha20); ++ } ++ ++ ret = crypto_skcipher_setkey(&chacha20->base, ++ (void *) key, sizeof(*key)); ++ if (ret) { ++ pr_err("crypto_skcipher_setkey() error: %i", ret); ++ goto err; ++ } ++ ++ ret = do_encrypt(chacha20, nonce, buf, len); ++err: ++ crypto_free_sync_skcipher(chacha20); ++ return ret; ++} ++ ++static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) ++{ ++ u8 key[POLY1305_KEY_SIZE]; ++ int ret; ++ ++ nonce.d[3] ^= BCH_NONCE_POLY; ++ ++ memset(key, 0, sizeof(key)); ++ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ if (ret) ++ return ret; ++ ++ desc->tfm = c->poly1305; ++ crypto_shash_init(desc); ++ crypto_shash_update(desc, key, sizeof(key)); ++ return 0; ++} ++ ++struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, ++ struct nonce nonce, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_xxhash: ++ case BCH_CSUM_crc64: { ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ ++ bch2_checksum_init(&state); ++ bch2_checksum_update(&state, data, len); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; ++ } ++ ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++ crypto_shash_update(desc, data, len); ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_encrypt(struct bch_fs *c, unsigned type, ++ struct nonce nonce, void *data, size_t len) ++{ ++ if (!bch2_csum_type_is_encryption(type)) ++ return 0; ++ ++ return do_encrypt(c->chacha20, nonce, data, len); ++} ++ ++static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio, ++ struct bvec_iter *iter) ++{ ++ struct bio_vec bv; ++ ++ switch (type) { ++ case BCH_CSUM_none: ++ return (struct bch_csum) { 0 }; ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_xxhash: ++ case BCH_CSUM_crc64: { ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ bch2_checksum_init(&state); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ 
bch2_checksum_update(&state, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; ++ } ++ ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ ++ crypto_shash_update(desc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crypto_shash_update(desc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ ++ return __bch2_checksum_bio(c, type, nonce, bio, &iter); ++} ++ ++int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ struct scatterlist sgl[16], *sg = sgl; ++ size_t bytes = 0; ++ int ret = 0; ++ ++ if (!bch2_csum_type_is_encryption(type)) ++ return 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ if (sg == sgl + ARRAY_SIZE(sgl)) { ++ sg_mark_end(sg - 1); ++ ++ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ if (ret) ++ return ret; ++ ++ nonce = nonce_add(nonce, bytes); ++ bytes = 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ sg = sgl; ++ } ++ ++ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); ++ bytes += bv.bv_len; ++ } ++ ++ sg_mark_end(sg - 1); ++ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, ++ struct bch_csum b, size_t b_len) ++{ ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ bch2_checksum_init(&state); ++ state.seed = a.lo; ++ ++ BUG_ON(!bch2_checksum_mergeable(type)); ++ ++ while (b_len) { ++ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); ++ ++ bch2_checksum_update(&state, ++ page_address(ZERO_PAGE(0)), b); ++ b_len -= b; ++ } ++ a.lo = bch2_checksum_final(&state); ++ a.lo ^= b.lo; ++ a.hi ^= b.hi; ++ return a; ++} ++ ++int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc_old, ++ struct bch_extent_crc_unpacked *crc_a, ++ struct bch_extent_crc_unpacked *crc_b, ++ unsigned len_a, unsigned len_b, ++ unsigned new_csum_type) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ struct nonce nonce = extent_nonce(version, crc_old); ++ struct bch_csum merged = { 0 }; ++ struct crc_split { ++ struct bch_extent_crc_unpacked *crc; ++ unsigned len; ++ unsigned csum_type; ++ struct bch_csum csum; ++ } splits[3] = { ++ { crc_a, len_a, new_csum_type }, ++ { crc_b, len_b, new_csum_type }, ++ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, ++ }, *i; ++ bool mergeable = crc_old.csum_type == new_csum_type && ++ bch2_checksum_mergeable(new_csum_type); ++ unsigned crc_nonce = crc_old.nonce; ++ ++ BUG_ON(len_a + len_b > bio_sectors(bio)); ++ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); ++ 
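Aside (not part of the patch): every non-cryptographic checksum type above reduces to the same init/update/final state machine, and bch2_checksum_merge() combines two adjacent checksums by extending the first over zeroes and xoring in the second. A minimal usage sketch, assuming only the interfaces this patch adds in checksum.h; example_concat_csum is a hypothetical helper, and only the mergeable types (none, crc32c, crc64) may be passed to the merge.

/* Hypothetical illustration, not patch content: */
static struct bch_csum example_concat_csum(struct bch_fs *c,
					   const void *a, size_t a_len,
					   const void *b, size_t b_len)
{
	struct bch_csum csum_a = bch2_checksum(c, BCH_CSUM_crc64, null_nonce(), a, a_len);
	struct bch_csum csum_b = bch2_checksum(c, BCH_CSUM_crc64, null_nonce(), b, b_len);

	/* Equals bch2_checksum() over the concatenation of a and b: */
	return bch2_checksum_merge(BCH_CSUM_crc64, csum_a, csum_b, b_len);
}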
BUG_ON(crc_is_compressed(crc_old)); ++ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)); ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ iter.bi_size = i->len << 9; ++ if (mergeable || i->crc) ++ i->csum = __bch2_checksum_bio(c, i->csum_type, ++ nonce, bio, &iter); ++ else ++ bio_advance_iter(bio, &iter, i->len << 9); ++ nonce = nonce_add(nonce, i->len << 9); ++ } ++ ++ if (mergeable) ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) ++ merged = bch2_checksum_merge(new_csum_type, merged, ++ i->csum, i->len << 9); ++ else ++ merged = bch2_checksum_bio(c, crc_old.csum_type, ++ extent_nonce(version, crc_old), bio); ++ ++ if (bch2_crc_cmp(merged, crc_old.csum)) { ++ bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" ++ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", ++ crc_old.csum.hi, ++ crc_old.csum.lo, ++ merged.hi, ++ merged.lo, ++ bch2_csum_types[crc_old.csum_type], ++ bch2_csum_types[new_csum_type]); ++ return -EIO; ++ } ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ if (i->crc) ++ *i->crc = (struct bch_extent_crc_unpacked) { ++ .csum_type = i->csum_type, ++ .compression_type = crc_old.compression_type, ++ .compressed_size = i->len, ++ .uncompressed_size = i->len, ++ .offset = 0, ++ .live_size = i->len, ++ .nonce = crc_nonce, ++ .csum = i->csum, ++ }; ++ ++ if (bch2_csum_type_is_encryption(new_csum_type)) ++ crc_nonce += i->len; ++ } ++ ++ return 0; ++} ++ ++#ifdef __KERNEL__ ++static int __bch2_request_key(char *key_description, struct bch_key *key) ++{ ++ struct key *keyring_key; ++ const struct user_key_payload *ukp; ++ int ret; ++ ++ keyring_key = request_key(&key_type_user, key_description, NULL); ++ if (IS_ERR(keyring_key)) ++ return PTR_ERR(keyring_key); ++ ++ down_read(&keyring_key->sem); ++ ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ up_read(&keyring_key->sem); ++ key_put(keyring_key); ++ ++ return ret; ++} ++#else ++#include ++ ++static int __bch2_request_key(char *key_description, struct bch_key *key) ++{ ++ key_serial_t key_id; ++ ++ key_id = request_key("user", key_description, NULL, ++ KEY_SPEC_USER_KEYRING); ++ if (key_id < 0) ++ return -errno; ++ ++ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) ++ return -1; ++ ++ return 0; ++} ++#endif ++ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ struct printbuf key_description = PRINTBUF; ++ int ret; ++ ++ prt_printf(&key_description, "bcachefs:"); ++ pr_uuid(&key_description, sb->user_uuid.b); ++ ++ ret = __bch2_request_key(key_description.buf, key); ++ printbuf_exit(&key_description); ++ return ret; ++} ++ ++int bch2_decrypt_sb_key(struct bch_fs *c, ++ struct bch_sb_field_crypt *crypt, ++ struct bch_key *key) ++{ ++ struct bch_encrypted_key sb_key = crypt->key; ++ struct bch_key user_key; ++ int ret = 0; ++ ++ /* is key encrypted? 
*/ ++ if (!bch2_key_is_encrypted(&sb_key)) ++ goto out; ++ ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ /* decrypt real key: */ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &sb_key, sizeof(sb_key)); ++ if (ret) ++ goto err; ++ ++ if (bch2_key_is_encrypted(&sb_key)) { ++ bch_err(c, "incorrect encryption key"); ++ ret = -EINVAL; ++ goto err; ++ } ++out: ++ *key = sb_key.key; ++err: ++ memzero_explicit(&sb_key, sizeof(sb_key)); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ return ret; ++} ++ ++static int bch2_alloc_ciphers(struct bch_fs *c) ++{ ++ int ret; ++ ++ if (!c->chacha20) ++ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ ret = PTR_ERR_OR_ZERO(c->chacha20); ++ ++ if (ret) { ++ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ if (!c->poly1305) ++ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ++ ret = PTR_ERR_OR_ZERO(c->poly1305); ++ ++ if (ret) { ++ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_disable_encryption(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ /* is key encrypted? */ ++ ret = 0; ++ if (bch2_key_is_encrypted(&crypt->key)) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.key = key; ++ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_enable_encryption(struct bch_fs *c, bool keyed) ++{ ++ struct bch_encrypted_key key; ++ struct bch_key user_key; ++ struct bch_sb_field_crypt *crypt; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ /* Do we already have an encryption key? 
*/ ++ if (bch2_sb_get_crypt(c->disk_sb.sb)) ++ goto err; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto err; ++ ++ key.magic = BCH_KEY_MAGIC; ++ get_random_bytes(&key.key, sizeof(key.key)); ++ ++ if (keyed) { ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &key, sizeof(key)); ++ if (ret) ++ goto err; ++ } ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto err; ++ ++ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); ++ if (!crypt) { ++ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ ++ goto err; ++ } ++ ++ crypt->key = key; ++ ++ /* write superblock */ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); ++ bch2_write_super(c); ++err: ++ mutex_unlock(&c->sb_lock); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ memzero_explicit(&key, sizeof(key)); ++ return ret; ++} ++ ++void bch2_fs_encryption_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->poly1305)) ++ crypto_free_shash(c->poly1305); ++ if (!IS_ERR_OR_NULL(c->chacha20)) ++ crypto_free_sync_skcipher(c->chacha20); ++ if (!IS_ERR_OR_NULL(c->sha256)) ++ crypto_free_shash(c->sha256); ++} ++ ++int bch2_fs_encryption_init(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->sha256 = crypto_alloc_shash("sha256", 0, 0); ++ ret = PTR_ERR_OR_ZERO(c->sha256); ++ if (ret) { ++ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); ++ goto out; ++ } ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto out; ++out: ++ memzero_explicit(&key, sizeof(key)); ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +new file mode 100644 +index 000000000000..c86c3c05d620 +--- /dev/null ++++ b/fs/bcachefs/checksum.h +@@ -0,0 +1,204 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHECKSUM_H ++#define _BCACHEFS_CHECKSUM_H ++ ++#include "bcachefs.h" ++#include "extents_types.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static inline bool bch2_checksum_mergeable(unsigned type) ++{ ++ ++ switch (type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, ++ struct bch_csum, size_t); ++ ++#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) ++#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) ++#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) ++#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) ++#define BCH_NONCE_POLY cpu_to_le32(1 << 31) ++ ++struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, ++ const void *, size_t); ++ ++/* ++ * This is used for various on disk data structures - bch_sb, prio_set, bset, ++ * jset: The checksum is _always_ the first field of these structs ++ */ ++#define csum_vstruct(_c, _type, _nonce, _i) \ ++({ \ ++ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ ++ const void *end = vstruct_end(_i); \ ++ \ ++ bch2_checksum(_c, _type, 
_nonce, start, end - start); \ ++}) ++ ++int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); ++int bch2_request_key(struct bch_sb *, struct bch_key *); ++ ++int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++ void *data, size_t); ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, ++ struct bch_extent_crc_unpacked, ++ struct bch_extent_crc_unpacked *, ++ struct bch_extent_crc_unpacked *, ++ unsigned, unsigned, unsigned); ++ ++int bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, ++ struct bch_key *); ++ ++int bch2_disable_encryption(struct bch_fs *); ++int bch2_enable_encryption(struct bch_fs *, bool); ++ ++void bch2_fs_encryption_exit(struct bch_fs *); ++int bch2_fs_encryption_init(struct bch_fs *); ++ ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++ bool data) ++{ ++ switch (type) { ++ case BCH_CSUM_OPT_none: ++ return BCH_CSUM_none; ++ case BCH_CSUM_OPT_crc32c: ++ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; ++ case BCH_CSUM_OPT_crc64: ++ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; ++ case BCH_CSUM_OPT_xxhash: ++ return BCH_CSUM_xxhash; ++ default: ++ BUG(); ++ } ++} ++ ++static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, ++ unsigned opt) ++{ ++ if (c->sb.encryption_type) ++ return c->opts.wide_macs ++ ? BCH_CSUM_chacha20_poly1305_128 ++ : BCH_CSUM_chacha20_poly1305_80; ++ ++ return bch2_csum_opt_to_type(opt, true); ++} ++ ++static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) ++{ ++ if (c->sb.encryption_type) ++ return BCH_CSUM_chacha20_poly1305_128; ++ ++ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); ++} ++ ++static const unsigned bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++static inline bool bch2_checksum_type_valid(const struct bch_fs *c, ++ unsigned type) ++{ ++ if (type >= BCH_CSUM_NR) ++ return false; ++ ++ if (bch2_csum_type_is_encryption(type) && !c->chacha20) ++ return false; ++ ++ return true; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) ++{ ++ /* ++ * XXX: need some way of preventing the compiler from optimizing this ++ * into a form that isn't constant time.. ++ */ ++ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; ++} ++ ++/* for skipping ahead and encrypting/decrypting at an offset: */ ++static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) ++{ ++ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); ++ ++ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); ++ return nonce; ++} ++ ++static inline struct nonce null_nonce(void) ++{ ++ struct nonce ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ return ret; ++} ++ ++static inline struct nonce extent_nonce(struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ unsigned compression_type = crc_is_compressed(crc) ++ ? crc.compression_type ++ : 0; ++ unsigned size = compression_type ? 
crc.uncompressed_size : 0; ++ struct nonce nonce = (struct nonce) {{ ++ [0] = cpu_to_le32(size << 22), ++ [1] = cpu_to_le32(version.lo), ++ [2] = cpu_to_le32(version.lo >> 32), ++ [3] = cpu_to_le32(version.hi| ++ (compression_type << 24))^BCH_NONCE_EXTENT, ++ }}; ++ ++ return nonce_add(nonce, crc.nonce << 9); ++} ++ ++static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) ++{ ++ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; ++} ++ ++static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) ++{ ++ __le64 magic = __bch2_sb_magic(sb); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) ++{ ++ __le64 magic = bch2_sb_magic(c); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++#endif /* _BCACHEFS_CHECKSUM_H */ +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +new file mode 100644 +index 000000000000..f3ffdbc38485 +--- /dev/null ++++ b/fs/bcachefs/clock.c +@@ -0,0 +1,191 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "clock.h" ++ ++#include ++#include ++#include ++ ++static inline long io_timer_cmp(io_timer_heap *h, ++ struct io_timer *l, ++ struct io_timer *r) ++{ ++ return l->expire - r->expire; ++} ++ ++void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (time_after_eq((unsigned long) atomic64_read(&clock->now), ++ timer->expire)) { ++ spin_unlock(&clock->timer_lock); ++ timer->fn(timer); ++ return; ++ } ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) ++ goto out; ++ ++ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); ++out: ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) { ++ heap_del(&clock->timers, i, io_timer_cmp, NULL); ++ break; ++ } ++ ++ spin_unlock(&clock->timer_lock); ++} ++ ++struct io_clock_wait { ++ struct io_timer io_timer; ++ struct timer_list cpu_timer; ++ struct task_struct *task; ++ int expired; ++}; ++ ++static void io_clock_wait_fn(struct io_timer *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, io_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++static void io_clock_cpu_timeout(struct timer_list *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, cpu_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) ++{ ++ struct io_clock_wait wait; ++ ++ /* XXX: calculate sleep time rigorously */ ++ wait.io_timer.expire = until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ schedule(); ++ ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ unsigned long io_until, ++ unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct io_clock_wait wait; ++ ++ wait.io_timer.expire = io_until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ 
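Aside (not part of the patch): the sleeps above wait on an io_clock, a clock that advances by sectors of completed IO rather than wall time. A minimal sketch of arming an io_timer directly, assuming the clock.h/clock_types.h interfaces this patch adds; example_timer_fn and the 1024-sector delay are invented for illustration.

/* Hypothetical illustration, not patch content: */
static void example_timer_fn(struct io_timer *t)
{
	pr_info("io clock reached %lu sectors\n", t->expire);
}

static void example_arm_io_timer(struct bch_fs *c, int rw)
{
	struct io_clock *clock = &c->io_clock[rw];
	static struct io_timer t;	/* must stay live until it fires or is deleted */

	t.fn	 = example_timer_fn;
	t.expire = (unsigned long) atomic64_read(&clock->now) + 1024;
	bch2_io_timer_add(clock, &t);
}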
bch2_io_timer_add(clock, &wait.io_timer); ++ ++ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); ++ ++ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) ++ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (wait.expired) ++ break; ++ ++ schedule(); ++ try_to_freeze(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ del_singleshot_timer_sync(&wait.cpu_timer); ++ destroy_timer_on_stack(&wait.cpu_timer); ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++static struct io_timer *get_expired_timer(struct io_clock *clock, ++ unsigned long now) ++{ ++ struct io_timer *ret = NULL; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (clock->timers.used && ++ time_after_eq(now, clock->timers.data[0]->expire)) ++ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); ++ ++ spin_unlock(&clock->timer_lock); ++ ++ return ret; ++} ++ ++void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) ++{ ++ struct io_timer *timer; ++ unsigned long now = atomic64_add_return(sectors, &clock->now); ++ ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} ++ ++void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) ++{ ++ unsigned long now; ++ unsigned i; ++ ++ spin_lock(&clock->timer_lock); ++ now = atomic64_read(&clock->now); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ prt_printf(out, "%ps:\t%li\n", ++ clock->timers.data[i]->fn, ++ clock->timers.data[i]->expire - now); ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_clock_exit(struct io_clock *clock) ++{ ++ free_heap(&clock->timers); ++ free_percpu(clock->pcpu_buf); ++} ++ ++int bch2_io_clock_init(struct io_clock *clock) ++{ ++ atomic64_set(&clock->now, 0); ++ spin_lock_init(&clock->timer_lock); ++ ++ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); ++ ++ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); ++ if (!clock->pcpu_buf) ++ return -ENOMEM; ++ ++ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +new file mode 100644 +index 000000000000..70a0f7436c84 +--- /dev/null ++++ b/fs/bcachefs/clock.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_H ++#define _BCACHEFS_CLOCK_H ++ ++void bch2_io_timer_add(struct io_clock *, struct io_timer *); ++void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, ++ unsigned long); ++ ++void __bch2_increment_clock(struct io_clock *, unsigned); ++ ++static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, ++ int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ ++ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= ++ IO_CLOCK_PCPU_SECTORS)) ++ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); ++ ++#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ++({ \ ++ long __ret = timeout; \ ++ might_sleep(); \ ++ if (!___wait_cond_timeout(condition)) \ ++ __ret = __wait_event_timeout(wq, condition, timeout); \ ++ __ret; \ ++}) ++ ++void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); ++ ++void bch2_io_clock_exit(struct io_clock *); ++int bch2_io_clock_init(struct io_clock *); ++ ++#endif /* _BCACHEFS_CLOCK_H */ +diff --git a/fs/bcachefs/clock_types.h 
b/fs/bcachefs/clock_types.h +new file mode 100644 +index 000000000000..5fae0012d808 +--- /dev/null ++++ b/fs/bcachefs/clock_types.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_TYPES_H ++#define _BCACHEFS_CLOCK_TYPES_H ++ ++#include "util.h" ++ ++#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) ++ ++/* ++ * Clocks/timers in units of sectors of IO: ++ * ++ * Note - they use percpu batching, so they're only approximate. ++ */ ++ ++struct io_timer; ++typedef void (*io_timer_fn)(struct io_timer *); ++ ++struct io_timer { ++ io_timer_fn fn; ++ unsigned long expire; ++}; ++ ++/* Amount to buffer up on a percpu counter */ ++#define IO_CLOCK_PCPU_SECTORS 128 ++ ++typedef HEAP(struct io_timer *) io_timer_heap; ++ ++struct io_clock { ++ atomic64_t now; ++ u16 __percpu *pcpu_buf; ++ unsigned max_slop; ++ ++ spinlock_t timer_lock; ++ io_timer_heap timers; ++}; ++ ++#endif /* _BCACHEFS_CLOCK_TYPES_H */ +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +new file mode 100644 +index 000000000000..f692f35a6a98 +--- /dev/null ++++ b/fs/bcachefs/compress.c +@@ -0,0 +1,639 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "compress.h" ++#include "extents.h" ++#include "io.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++/* Bounce buffer: */ ++struct bbuf { ++ void *b; ++ enum { ++ BB_NONE, ++ BB_VMAP, ++ BB_KMALLOC, ++ BB_MEMPOOL, ++ } type; ++ int rw; ++}; ++ ++static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) ++{ ++ void *b; ++ ++ BUG_ON(size > c->opts.encoded_extent_max); ++ ++ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ BUG(); ++} ++ ++static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ void *expected_start = NULL; ++ ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (expected_start && ++ expected_start != page_address(bv.bv_page) + bv.bv_offset) ++ return false; ++ ++ expected_start = page_address(bv.bv_page) + ++ bv.bv_offset + bv.bv_len; ++ } ++ ++ return true; ++} ++ ++static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, ++ struct bvec_iter start, int rw) ++{ ++ struct bbuf ret; ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ unsigned nr_pages = 0; ++ struct page *stack_pages[16]; ++ struct page **pages = NULL; ++ void *data; ++ ++ BUG_ON(start.bi_size > c->opts.encoded_extent_max); ++ ++ if (!PageHighMem(bio_iter_page(bio, start)) && ++ bio_phys_contig(bio, start)) ++ return (struct bbuf) { ++ .b = page_address(bio_iter_page(bio, start)) + ++ bio_iter_offset(bio, start), ++ .type = BB_NONE, .rw = rw ++ }; ++ ++ /* check if we can map the pages contiguously: */ ++ __bio_for_each_segment(bv, bio, iter, start) { ++ if (iter.bi_size != start.bi_size && ++ bv.bv_offset) ++ goto bounce; ++ ++ if (bv.bv_len < iter.bi_size && ++ bv.bv_offset + bv.bv_len < PAGE_SIZE) ++ goto bounce; ++ ++ nr_pages++; ++ } ++ ++ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); ++ ++ pages = nr_pages > ARRAY_SIZE(stack_pages) ++ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) ++ : stack_pages; ++ if (!pages) ++ goto bounce; ++ ++ nr_pages = 0; ++ __bio_for_each_segment(bv, bio, iter, start) ++ pages[nr_pages++] = bv.bv_page; ++ ++ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (pages != stack_pages) ++ kfree(pages); ++ ++ if (data) ++ return (struct bbuf) { ++ .b = data + bio_iter_offset(bio, start), ++ .type = BB_VMAP, .rw = rw ++ }; ++bounce: ++ ret = __bounce_alloc(c, start.bi_size, rw); ++ ++ if (rw == READ) ++ memcpy_from_bio(ret.b, bio, start); ++ ++ return ret; ++} ++ ++static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) ++{ ++ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); ++} ++ ++static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) ++{ ++ switch (buf.type) { ++ case BB_NONE: ++ break; ++ case BB_VMAP: ++ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); ++ break; ++ case BB_KMALLOC: ++ kfree(buf.b); ++ break; ++ case BB_MEMPOOL: ++ mempool_free(buf.b, &c->compression_bounce[buf.rw]); ++ break; ++ } ++} ++ ++static inline void zlib_set_workspace(z_stream *strm, void *workspace) ++{ ++#ifdef __KERNEL__ ++ strm->workspace = workspace; ++#endif ++} ++ ++static int __bio_uncompress(struct bch_fs *c, struct bio *src, ++ void *dst_data, struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf src_data = { NULL }; ++ size_t src_len = src->bi_iter.bi_size; ++ size_t dst_len = crc.uncompressed_size << 9; ++ void *workspace; ++ int ret; ++ ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ switch (crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: ++ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, ++ src_len, dst_len, dst_len); ++ if (ret != dst_len) ++ goto err; ++ break; ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src_data.b, ++ .avail_in = src_len, ++ .next_out = dst_data, ++ .avail_out = dst_len, ++ }; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_inflateInit2(&strm, -MAX_WBITS); ++ ret = zlib_inflate(&strm, Z_FINISH); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != Z_STREAM_END) ++ goto err; ++ break; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_DCtx *ctx; ++ size_t real_src_len = le32_to_cpup(src_data.b); ++ ++ if (real_src_len > src_len - 4) ++ goto err; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ++ ++ ret = zstd_decompress_dctx(ctx, ++ dst_data, dst_len, ++ src_data.b + 4, real_src_len); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != dst_len) ++ goto err; ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ret = 0; ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ return ret; ++err: ++ ret = -EIO; ++ goto out; ++} ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, ++ struct bch_extent_crc_unpacked *crc) ++{ ++ struct bbuf data = { NULL }; ++ size_t dst_len = crc->uncompressed_size << 9; ++ ++ /* bio must own its pages: */ ++ BUG_ON(!bio->bi_vcnt); ++ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); ++ ++ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc->compressed_size << 9 > c->opts.encoded_extent_max) { ++ bch_err(c, "error rewriting existing data: extent too big"); ++ return -EIO; ++ } ++ ++ data = __bounce_alloc(c, dst_len, WRITE); ++ ++ if (__bio_uncompress(c, bio, data.b, 
*crc)) { ++ bch_err(c, "error rewriting existing data: decompression error"); ++ bio_unmap_or_unbounce(c, data); ++ return -EIO; ++ } ++ ++ /* ++ * XXX: don't have a good way to assert that the bio was allocated with ++ * enough space, we depend on bch2_move_extent doing the right thing ++ */ ++ bio->bi_iter.bi_size = crc->live_size << 9; ++ ++ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); ++ ++ crc->csum_type = 0; ++ crc->compression_type = 0; ++ crc->compressed_size = crc->live_size; ++ crc->uncompressed_size = crc->live_size; ++ crc->offset = 0; ++ crc->csum = (struct bch_csum) { 0, 0 }; ++ ++ bio_unmap_or_unbounce(c, data); ++ return 0; ++} ++ ++int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, ++ struct bio *dst, struct bvec_iter dst_iter, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf dst_data = { NULL }; ++ size_t dst_len = crc.uncompressed_size << 9; ++ int ret = -ENOMEM; ++ ++ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc.compressed_size << 9 > c->opts.encoded_extent_max) ++ return -EIO; ++ ++ dst_data = dst_len == dst_iter.bi_size ++ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) ++ : __bounce_alloc(c, dst_len, WRITE); ++ ++ ret = __bio_uncompress(c, src, dst_data.b, crc); ++ if (ret) ++ goto err; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); ++err: ++ bio_unmap_or_unbounce(c, dst_data); ++ return ret; ++} ++ ++static int attempt_compress(struct bch_fs *c, ++ void *workspace, ++ void *dst, size_t dst_len, ++ void *src, size_t src_len, ++ enum bch_compression_type compression_type) ++{ ++ switch (compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4: { ++ int len = src_len; ++ int ret = LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ ++ if (len < src_len) ++ return -len; ++ ++ return ret; ++ } ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src, ++ .avail_in = src_len, ++ .next_out = dst, ++ .avail_out = dst_len, ++ }; ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, ++ Z_DEFAULT_STRATEGY); ++ ++ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) ++ return 0; ++ ++ if (zlib_deflateEnd(&strm) != Z_OK) ++ return 0; ++ ++ return strm.total_out; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, ++ zstd_cctx_workspace_bound(&c->zstd_params.cParams)); ++ ++ /* ++ * ZSTD requires that when we decompress we pass in the exact ++ * compressed size - rounding it up to the nearest sector ++ * doesn't work, so we use the first 4 bytes of the buffer for ++ * that. ++ * ++ * Additionally, the ZSTD code seems to have a bug where it will ++ * write just past the end of the buffer - so subtract a fudge ++ * factor (7 bytes) from the dst buffer size to account for ++ * that. 
++ */ ++ size_t len = zstd_compress_cctx(ctx, ++ dst + 4, dst_len - 4 - 7, ++ src, src_len, ++ &c->zstd_params); ++ if (zstd_is_error(len)) ++ return 0; ++ ++ *((__le32 *) dst) = cpu_to_le32(len); ++ return len + 4; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned __bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ enum bch_compression_type compression_type) ++{ ++ struct bbuf src_data = { NULL }, dst_data = { NULL }; ++ void *workspace; ++ unsigned pad; ++ int ret = 0; ++ ++ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); ++ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ ++ /* If it's only one block, don't bother trying to compress: */ ++ if (src->bi_iter.bi_size <= c->opts.block_size) ++ return 0; ++ ++ dst_data = bio_map_or_bounce(c, dst, WRITE); ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); ++ ++ *src_len = src->bi_iter.bi_size; ++ *dst_len = dst->bi_iter.bi_size; ++ ++ /* ++ * XXX: this algorithm sucks when the compression code doesn't tell us ++ * how much would fit, like LZ4 does: ++ */ ++ while (1) { ++ if (*src_len <= block_bytes(c)) { ++ ret = -1; ++ break; ++ } ++ ++ ret = attempt_compress(c, workspace, ++ dst_data.b, *dst_len, ++ src_data.b, *src_len, ++ compression_type); ++ if (ret > 0) { ++ *dst_len = ret; ++ ret = 0; ++ break; ++ } ++ ++ /* Didn't fit: should we retry with a smaller amount? */ ++ if (*src_len <= *dst_len) { ++ ret = -1; ++ break; ++ } ++ ++ /* ++ * If ret is negative, it's a hint as to how much data would fit ++ */ ++ BUG_ON(-ret >= *src_len); ++ ++ if (ret < 0) ++ *src_len = -ret; ++ else ++ *src_len -= (*src_len - *dst_len) / 2; ++ *src_len = round_down(*src_len, block_bytes(c)); ++ } ++ ++ mempool_free(workspace, &c->compress_workspace[compression_type]); ++ ++ if (ret) ++ goto err; ++ ++ /* Didn't get smaller: */ ++ if (round_up(*dst_len, block_bytes(c)) >= *src_len) ++ goto err; ++ ++ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; ++ ++ memset(dst_data.b + *dst_len, 0, pad); ++ *dst_len += pad; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); ++ ++ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); ++ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); ++ BUG_ON(*dst_len & (block_bytes(c) - 1)); ++ BUG_ON(*src_len & (block_bytes(c) - 1)); ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ bio_unmap_or_unbounce(c, dst_data); ++ return compression_type; ++err: ++ compression_type = BCH_COMPRESSION_TYPE_incompressible; ++ goto out; ++} ++ ++unsigned bch2_bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ unsigned orig_dst = dst->bi_iter.bi_size; ++ unsigned orig_src = src->bi_iter.bi_size; ++ ++ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ ++ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, ++ c->opts.encoded_extent_max); ++ /* Don't generate a bigger output than input: */ ++ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ ++ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) ++ compression_type = BCH_COMPRESSION_TYPE_lz4; ++ ++ compression_type = ++ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ ++ dst->bi_iter.bi_size = orig_dst; ++ src->bi_iter.bi_size = orig_src; ++ return compression_type; ++} ++ ++static int 
__bch2_fs_compress_init(struct bch_fs *, u64); ++ ++#define BCH_FEATURE_none 0 ++ ++static const unsigned bch2_compression_opt_to_feature[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++#undef BCH_FEATURE_none ++ ++static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) ++{ ++ int ret = 0; ++ ++ if ((c->sb.features & f) == f) ++ return 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if ((c->sb.features & f) == f) { ++ mutex_unlock(&c->sb_lock); ++ return 0; ++ } ++ ++ ret = __bch2_fs_compress_init(c, c->sb.features|f); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ret; ++ } ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(f); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *c, ++ unsigned compression_type) ++{ ++ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); ++ ++ return compression_type ++ ? __bch2_check_set_has_compressed_data(c, ++ 1ULL << bch2_compression_opt_to_feature[compression_type]) ++ : 0; ++} ++ ++void bch2_fs_compress_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mempool_exit(&c->decompress_workspace); ++ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) ++ mempool_exit(&c->compress_workspace[i]); ++ mempool_exit(&c->compression_bounce[WRITE]); ++ mempool_exit(&c->compression_bounce[READ]); ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++{ ++ size_t decompress_workspace_size = 0; ++ bool decompress_workspace_needed; ++ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); ++ struct { ++ unsigned feature; ++ unsigned type; ++ size_t compress_workspace; ++ size_t decompress_workspace; ++ } compression_types[] = { ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, ++ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize(), }, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, ++ zstd_cctx_workspace_bound(¶ms.cParams), ++ zstd_dctx_workspace_bound() }, ++ }, *i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->zstd_params = params; ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) ++ if (features & (1 << i->feature)) ++ goto have_compressed; ++ ++ goto out; ++have_compressed: ++ ++ if (!mempool_initialized(&c->compression_bounce[READ])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], ++ 1, c->opts.encoded_extent_max); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->compression_bounce[WRITE])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], ++ 1, c->opts.encoded_extent_max); ++ if (ret) ++ goto out; ++ } ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) { ++ decompress_workspace_size = ++ max(decompress_workspace_size, i->decompress_workspace); ++ ++ if (!(features & (1 << i->feature))) ++ continue; ++ ++ if (i->decompress_workspace) ++ decompress_workspace_needed = true; ++ ++ if (mempool_initialized(&c->compress_workspace[i->type])) ++ continue; ++ ++ ret = mempool_init_kvpmalloc_pool( ++ &c->compress_workspace[i->type], ++ 1, i->compress_workspace); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->decompress_workspace)) { ++ ret = mempool_init_kvpmalloc_pool( ++ &c->decompress_workspace, ++ 1, decompress_workspace_size); ++ if (ret) ++ goto out; ++ } 
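Aside (not part of the patch): as the comment in attempt_compress() explains, the zstd path stores the exact compressed length in the first four bytes of the encoded buffer, because zstd_decompress_dctx() needs the precise frame size rather than a sector-rounded one. A minimal sketch of how __bio_uncompress() recovers it; example_zstd_frame is a hypothetical helper.

/* Hypothetical illustration, not patch content: */
static const void *example_zstd_frame(const void *buf, size_t buf_len, size_t *frame_len)
{
	/* attempt_compress() wrote the real compressed size as a little endian u32: */
	*frame_len = le32_to_cpup((const __le32 *) buf);
	if (*frame_len > buf_len - 4)
		return NULL;	/* claimed size doesn't fit in what was stored */

	return buf + 4;		/* this is what gets handed to zstd_decompress_dctx() */
}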
++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_compress_init(struct bch_fs *c) ++{ ++ u64 f = c->sb.features; ++ ++ if (c->opts.compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; ++ ++ if (c->opts.background_compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ ++ return __bch2_fs_compress_init(c, f); ++ ++} +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +new file mode 100644 +index 000000000000..4bab1f61b3b5 +--- /dev/null ++++ b/fs/bcachefs/compress.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COMPRESS_H ++#define _BCACHEFS_COMPRESS_H ++ ++#include "extents_types.h" ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, ++ struct bch_extent_crc_unpacked *); ++int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, ++ struct bvec_iter, struct bch_extent_crc_unpacked); ++unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, ++ struct bio *, size_t *, unsigned); ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); ++void bch2_fs_compress_exit(struct bch_fs *); ++int bch2_fs_compress_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c +new file mode 100644 +index 000000000000..745f856e6d3e +--- /dev/null ++++ b/fs/bcachefs/counters.c +@@ -0,0 +1,107 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "super-io.h" ++#include "counters.h" ++ ++/* BCH_SB_FIELD_counters */ ++ ++const char * const bch2_counter_names[] = { ++#define x(t, n, ...) (#t), ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ NULL ++}; ++ ++static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) ++{ ++ if (!ctrs) ++ return 0; ++ ++ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; ++}; ++ ++static int bch2_sb_counters_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ return 0; ++}; ++ ++void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_counters *ctrs = field_to_type(f, counters); ++ unsigned int i; ++ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); ++ ++ for (i = 0; i < nr; i++) { ++ if (i < BCH_COUNTER_NR) ++ prt_printf(out, "%s", bch2_counter_names[i]); ++ else ++ prt_printf(out, "(unknown)"); ++ ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); ++ prt_newline(out); ++ }; ++}; ++ ++int bch2_sb_counters_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); ++ unsigned int i; ++ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); ++ u64 val = 0; ++ ++ for (i = 0; i < BCH_COUNTER_NR; i++) ++ c->counters_on_mount[i] = 0; ++ ++ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { ++ val = le64_to_cpu(ctrs->d[i]); ++ percpu_u64_set(&c->counters[i], val); ++ c->counters_on_mount[i] = val; ++ } ++ return 0; ++}; ++ ++int bch2_sb_counters_from_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); ++ struct bch_sb_field_counters *ret; ++ unsigned int i; ++ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); ++ ++ if (nr < BCH_COUNTER_NR) { ++ ret = bch2_sb_resize_counters(&c->disk_sb, ++ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); ++ ++ if (ret) { ++ ctrs = ret; ++ nr = bch2_sb_counter_nr_entries(ctrs); ++ } ++ } ++ ++ ++ for (i = 0; i < 
min_t(unsigned int, nr, BCH_COUNTER_NR); i++) ++ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); ++ return 0; ++} ++ ++void bch2_fs_counters_exit(struct bch_fs *c) ++{ ++ free_percpu(c->counters); ++} ++ ++int bch2_fs_counters_init(struct bch_fs *c) ++{ ++ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); ++ if (!c->counters) ++ return -ENOMEM; ++ ++ return bch2_sb_counters_to_cpu(c); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_counters = { ++ .validate = bch2_sb_counters_validate, ++ .to_text = bch2_sb_counters_to_text, ++}; +diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h +new file mode 100644 +index 000000000000..4778aa19bf34 +--- /dev/null ++++ b/fs/bcachefs/counters.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COUNTERS_H ++#define _BCACHEFS_COUNTERS_H ++ ++#include "bcachefs.h" ++#include "super-io.h" ++ ++ ++int bch2_sb_counters_to_cpu(struct bch_fs *); ++int bch2_sb_counters_from_cpu(struct bch_fs *); ++ ++void bch2_fs_counters_exit(struct bch_fs *); ++int bch2_fs_counters_init(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_counters; ++ ++#endif // _BCACHEFS_COUNTERS_H +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +new file mode 100644 +index 000000000000..519ab9b96e67 +--- /dev/null ++++ b/fs/bcachefs/darray.h +@@ -0,0 +1,77 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DARRAY_H ++#define _BCACHEFS_DARRAY_H ++ ++/* ++ * Dynamic arrays: ++ * ++ * Inspired by CCAN's darray ++ */ ++ ++#include "util.h" ++#include ++ ++#define DARRAY(type) \ ++struct { \ ++ size_t nr, size; \ ++ type *data; \ ++} ++ ++typedef DARRAY(void) darray_void; ++ ++static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) ++{ ++ if (d->nr + more > d->size) { ++ size_t new_size = roundup_pow_of_two(d->nr + more); ++ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ d->data = data; ++ d->size = new_size; ++ } ++ ++ return 0; ++} ++ ++#define darray_make_room(_d, _more) \ ++ __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) ++ ++#define darray_top(_d) ((_d).data[(_d).nr]) ++ ++#define darray_push(_d, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ (_d)->data[(_d)->nr++] = (_item); \ ++ _ret; \ ++}) ++ ++#define darray_insert_item(_d, _pos, _item) \ ++({ \ ++ size_t pos = (_pos); \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ ++ _ret; \ ++}) ++ ++#define darray_for_each(_d, _i) \ ++ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) ++ ++#define darray_init(_d) \ ++do { \ ++ (_d)->data = NULL; \ ++ (_d)->nr = (_d)->size = 0; \ ++} while (0) ++ ++#define darray_exit(_d) \ ++do { \ ++ kfree((_d)->data); \ ++ darray_init(_d); \ ++} while (0) ++ ++#endif /* _BCACHEFS_DARRAY_H */ +diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c +new file mode 100644 +index 000000000000..3b442b01ca86 +--- /dev/null ++++ b/fs/bcachefs/data_update.c +@@ -0,0 +1,376 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "data_update.h" ++#include "ec.h" ++#include "extents.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "subvolume.h" ++ ++#include ++ ++static int insert_snapshot_whiteouts(struct 
btree_trans *trans, ++ enum btree_id id, ++ struct bpos old_pos, ++ struct bpos new_pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, update_iter; ++ struct bkey_s_c k; ++ snapshot_id_list s; ++ int ret; ++ ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ ++ darray_init(&s); ++ ++ if (!bkey_cmp(old_pos, new_pos)) ++ return 0; ++ ++ if (!snapshot_t(c, old_pos.snapshot)->children[0]) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, old_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (bkey_cmp(old_pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { ++ struct bkey_i *update; ++ ++ if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) ++ continue; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ break; ++ ++ bkey_init(&update->k); ++ update->k.p = new_pos; ++ update->k.p.snapshot = k.k->p.snapshot; ++ ++ bch2_trans_iter_init(trans, &update_iter, id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &update_iter); ++ if (ret) ++ break; ++ ++ ret = snapshot_list_add(c, &s, k.k->p.snapshot); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ darray_exit(&s); ++ ++ return ret; ++} ++ ++static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ ptr->cached = true; ++} ++ ++static int bch2_data_update_index_update(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct data_update *m = ++ container_of(op, struct data_update, op); ++ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_buf _new, _insert; ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&_new); ++ bch2_bkey_buf_init(&_insert); ++ bch2_bkey_buf_realloc(&_insert, c, U8_MAX); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ bch2_trans_iter_init(&trans, &iter, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (1) { ++ struct bkey_s_c k; ++ struct bkey_s_c old = bkey_i_to_s_c(m->k.k); ++ struct bkey_i *insert; ++ struct bkey_i_extent *new; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bpos next_pos; ++ bool did_work = false; ++ bool should_check_enospc; ++ s64 i_sectors_delta = 0, disk_sectors_delta = 0; ++ unsigned i; ++ ++ bch2_trans_begin(&trans); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ new = bkey_i_to_extent(bch2_keylist_front(keys)); ++ ++ if (!bch2_extents_match(k, old)) ++ goto nomatch; ++ ++ bkey_reassemble(_insert.k, k); ++ insert = _insert.k; ++ ++ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(_new.k); ++ bch2_cut_front(iter.pos, &new->k_i); ++ ++ bch2_cut_front(iter.pos, insert); ++ bch2_cut_back(new->k.p, insert); ++ bch2_cut_back(insert->k.p, &new->k_i); ++ ++ /* ++ * @old: extent that we read from ++ * @insert: key that we're going to update, 
initialized from ++ * extent currently in btree - same as @old unless we raced with ++ * other updates ++ * @new: extent with new pointers that we'll be adding to @insert ++ * ++ * Fist, drop rewrite_ptrs from @new: ++ */ ++ i = 0; ++ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { ++ if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { ++ /* ++ * If we're going to be adding a pointer to the ++ * same device, we have to drop the old one - ++ * otherwise, we can just mark it cached: ++ */ ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) ++ bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); ++ else ++ bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); ++ } ++ i++; ++ } ++ ++ ++ /* Add new ptrs: */ ++ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { ++ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { ++ /* ++ * raced with another move op? extent already ++ * has a pointer to the device we just wrote ++ * data to ++ */ ++ continue; ++ } ++ ++ bch2_extent_ptr_decoded_append(insert, &p); ++ did_work = true; ++ } ++ ++ if (!did_work) ++ goto nomatch; ++ ++ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); ++ bch2_extent_normalize(c, bkey_i_to_s(insert)); ++ ++ ret = bch2_sum_sector_overwrites(&trans, &iter, insert, ++ &should_check_enospc, ++ &i_sectors_delta, ++ &disk_sectors_delta); ++ if (ret) ++ goto err; ++ ++ if (disk_sectors_delta > (s64) op->res.sectors) { ++ ret = bch2_disk_reservation_add(c, &op->res, ++ disk_sectors_delta - op->res.sectors, ++ !should_check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto out; ++ } ++ ++ next_pos = insert->k.p; ++ ++ ret = insert_snapshot_whiteouts(&trans, m->btree_id, ++ k.k->p, insert->k.p) ?: ++ bch2_trans_update(&trans, &iter, insert, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_trans_commit(&trans, &op->res, ++ op_journal_seq(op), ++ BTREE_INSERT_NOFAIL| ++ m->data_opts.btree_insert_flags); ++ if (!ret) { ++ bch2_btree_iter_set_pos(&iter, next_pos); ++ atomic_long_inc(&c->extent_migrate_done); ++ if (ec_ob) ++ bch2_ob_add_backpointer(c, ec_ob, &insert->k); ++ } ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ if (ret) ++ break; ++next: ++ while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ bch2_keylist_pop_front(keys); ++ if (bch2_keylist_empty(keys)) ++ goto out; ++ } ++ continue; ++nomatch: ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, old); ++ bch_info(c, "no match for %s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ if (m->ctxt) { ++ BUG_ON(k.k->p.offset <= iter.pos.offset); ++ atomic64_inc(&m->ctxt->stats->keys_raced); ++ atomic64_add(k.k->p.offset - iter.pos.offset, ++ &m->ctxt->stats->sectors_raced); ++ } ++ atomic_long_inc(&c->extent_migrate_raced); ++ trace_move_race(&new->k); ++ bch2_btree_iter_advance(&iter); ++ goto next; ++ } ++out: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&_insert, c); ++ bch2_bkey_buf_exit(&_new, c); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); ++ return ret; ++} ++ ++void bch2_data_update_read_done(struct data_update *m, ++ struct bch_extent_crc_unpacked crc, ++ struct closure *cl) ++{ ++ /* write bio must own pages: */ ++ BUG_ON(!m->op.wbio.bio.bi_vcnt); ++ ++ m->op.crc = crc; ++ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; ++ ++ 
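++	/*
++	 * Hand the write off to bch2_write(): it runs via m->op.cl, with
++	 * @cl installed as the parent closure, so the caller can wait on
++	 * @cl for completion of the data update write.
++	 */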
closure_call(&m->op.cl, bch2_write, NULL, cl); ++} ++ ++void bch2_data_update_exit(struct data_update *update) ++{ ++ struct bch_fs *c = update->op.c; ++ ++ bch2_bkey_buf_exit(&update->k, c); ++ bch2_disk_reservation_put(c, &update->op.res); ++ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++} ++ ++int bch2_data_update_init(struct bch_fs *c, struct data_update *m, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ struct data_update_opts data_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; ++ int ret; ++ ++ bch2_bkey_buf_init(&m->k); ++ bch2_bkey_buf_reassemble(&m->k, c, k); ++ m->btree_id = btree_id; ++ m->data_opts = data_opts; ++ ++ bch2_write_op_init(&m->op, c, io_opts); ++ m->op.pos = bkey_start_pos(k.k); ++ m->op.version = k.k->version; ++ m->op.target = data_opts.target, ++ m->op.write_point = wp; ++ m->op.flags |= BCH_WRITE_PAGES_STABLE| ++ BCH_WRITE_PAGES_OWNED| ++ BCH_WRITE_DATA_ENCODED| ++ BCH_WRITE_FROM_INTERNAL| ++ m->data_opts.write_flags; ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) ++ m->op.alloc_reserve = RESERVE_movinggc; ++ m->op.index_update_fn = bch2_data_update_index_update; ++ ++ i = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ m->data_opts.rewrite_ptrs &= ~(1U << i); ++ ++ if (!((1U << i) & m->data_opts.rewrite_ptrs)) ++ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); ++ ++ if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ crc_is_compressed(p.crc)) ++ reserve_sectors += k.k->size; ++ ++ /* ++ * op->csum_type is normally initialized from the fs/file's ++ * current options - but if an extent is encrypted, we require ++ * that it stays encrypted: ++ */ ++ if (bch2_csum_type_is_encryption(p.crc.csum_type)) { ++ m->op.nonce = p.crc.nonce + p.crc.offset; ++ m->op.csum_type = p.crc.csum_type; ++ } ++ ++ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ m->op.incompressible = true; ++ ++ i++; ++ } ++ ++ if (reserve_sectors) { ++ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, ++ m->data_opts.extra_replicas ++ ? 
0 ++ : BCH_DISK_RESERVATION_NOFAIL); ++ if (ret) ++ return ret; ++ } ++ ++ m->op.nr_replicas = m->op.nr_replicas_required = ++ hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; ++ return 0; ++} +diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h +new file mode 100644 +index 000000000000..e64505453a55 +--- /dev/null ++++ b/fs/bcachefs/data_update.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHEFS_DATA_UPDATE_H ++#define _BCACHEFS_DATA_UPDATE_H ++ ++#include "bkey_buf.h" ++#include "io_types.h" ++ ++struct moving_context; ++ ++struct data_update_opts { ++ unsigned rewrite_ptrs; ++ u16 target; ++ u8 extra_replicas; ++ unsigned btree_insert_flags; ++ unsigned write_flags; ++}; ++ ++struct data_update { ++ /* extent being updated: */ ++ enum btree_id btree_id; ++ struct bkey_buf k; ++ struct data_update_opts data_opts; ++ struct moving_context *ctxt; ++ struct bch_write_op op; ++}; ++ ++void bch2_data_update_read_done(struct data_update *, ++ struct bch_extent_crc_unpacked, ++ struct closure *); ++ ++void bch2_data_update_exit(struct data_update *); ++int bch2_data_update_init(struct bch_fs *, struct data_update *, ++ struct write_point_specifier, ++ struct bch_io_opts, struct data_update_opts, ++ enum btree_id, struct bkey_s_c); ++ ++#endif /* _BCACHEFS_DATA_UPDATE_H */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +new file mode 100644 +index 000000000000..cd37a1016e25 +--- /dev/null ++++ b/fs/bcachefs/debug.c +@@ -0,0 +1,764 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Assorted bcachefs debug code ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "super.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct dentry *bch_debug; ++ ++static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, ++ struct extent_ptr_decoded pick) ++{ ++ struct btree *v = c->verify_data; ++ struct btree_node *n_ondisk = c->verify_ondisk; ++ struct btree_node *n_sorted = c->verify_data->data; ++ struct bset *sorted, *inmemory = &b->data->keys; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ struct bio *bio; ++ bool failed = false; ++ ++ if (!bch2_dev_get_ioref(ca, READ)) ++ return false; ++ ++ bio = bio_alloc_bioset(ca->disk_sb.bdev, ++ buf_pages(n_sorted, btree_bytes(c)), ++ REQ_OP_READ|REQ_META, ++ GFP_NOIO, ++ &c->btree_bio); ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bch2_bio_map(bio, n_sorted, btree_bytes(c)); ++ ++ submit_bio_wait(bio); ++ ++ bio_put(bio); ++ percpu_ref_put(&ca->io_ref); ++ ++ memcpy(n_ondisk, n_sorted, btree_bytes(c)); ++ ++ v->written = 0; ++ if (bch2_btree_node_read_done(c, ca, v, false)) ++ return false; ++ ++ n_sorted = c->verify_data->data; ++ sorted = &n_sorted->keys; ++ ++ if (inmemory->u64s != sorted->u64s || ++ memcmp(inmemory->start, ++ sorted->start, ++ vstruct_end(inmemory) - (void *) inmemory->start)) { ++ unsigned offset = 0, sectors; ++ struct bset *i; ++ unsigned j; ++ ++ console_lock(); ++ ++ printk(KERN_ERR "*** in memory:\n"); ++ bch2_dump_bset(c, b, inmemory, 0); ++ ++ printk(KERN_ERR "*** read back in:\n"); ++ bch2_dump_bset(c, v, sorted, 0); ++ ++ while (offset < 
v->written) { ++ if (!offset) { ++ i = &n_ondisk->keys; ++ sectors = vstruct_blocks(n_ondisk, c->block_bits) << ++ c->block_bits; ++ } else { ++ struct btree_node_entry *bne = ++ (void *) n_ondisk + (offset << 9); ++ i = &bne->keys; ++ ++ sectors = vstruct_blocks(bne, c->block_bits) << ++ c->block_bits; ++ } ++ ++ printk(KERN_ERR "*** on disk block %u:\n", offset); ++ bch2_dump_bset(c, b, i, offset); ++ ++ offset += sectors; ++ } ++ ++ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) ++ if (inmemory->_data[j] != sorted->_data[j]) ++ break; ++ ++ console_unlock(); ++ bch_err(c, "verify failed at key %u", j); ++ ++ failed = true; ++ } ++ ++ if (v->written != b->written) { ++ bch_err(c, "written wrong: expected %u, got %u", ++ b->written, v->written); ++ failed = true; ++ } ++ ++ return failed; ++} ++ ++void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_ptrs_c ptrs; ++ struct extent_ptr_decoded p; ++ const union bch_extent_entry *entry; ++ struct btree *v; ++ struct bset *inmemory = &b->data->keys; ++ struct bkey_packed *k; ++ bool failed = false; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ bch2_btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ if (!c->verify_ondisk) { ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) ++ goto out; ++ } ++ ++ if (!c->verify_data) { ++ c->verify_data = __bch2_btree_node_mem_alloc(c); ++ if (!c->verify_data) ++ goto out; ++ ++ list_del_init(&c->verify_data->list); ++ } ++ ++ BUG_ON(b->nsets != 1); ++ ++ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) ++ if (k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); ++ v->mem_ptr = 0; ++ } ++ ++ v = c->verify_data; ++ bkey_copy(&v->key, &b->key); ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; ++ bch2_btree_keys_init(v); ++ ++ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); ++ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) ++ failed |= bch2_btree_verify_replica(c, b, p); ++ ++ if (failed) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); ++ printbuf_exit(&buf); ++ } ++out: ++ mutex_unlock(&c->verify_lock); ++ bch2_btree_node_io_unlock(b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++ ++/* XXX: bch_fs refcounting */ ++ ++struct dump_iter { ++ struct bch_fs *c; ++ enum btree_id id; ++ struct bpos from; ++ struct bpos prev_node; ++ u64 iter; ++ ++ struct printbuf buf; ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++static int flush_buf(struct dump_iter *i) ++{ ++ if (i->buf.pos) { ++ size_t bytes = min_t(size_t, i->buf.pos, i->size); ++ int err = copy_to_user(i->ubuf, i->buf.buf, bytes); ++ ++ if (err) ++ return err; ++ ++ i->ret += bytes; ++ i->ubuf += bytes; ++ i->size -= bytes; ++ i->buf.pos -= bytes; ++ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); ++ } ++ ++ return 0; ++} ++ ++static int bch2_dump_open(struct inode *inode, struct file *file) ++{ ++ struct btree_debug *bd = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = i; ++ i->from = POS_MIN; ++ i->iter = 0; ++ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); ++ i->id = bd->id; ++ i->buf = PRINTBUF; ++ ++ return 0; ++} ++ ++static int bch2_dump_release(struct 
inode *inode, struct file *file) ++{ ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); ++ return 0; ++} ++ ++static ssize_t bch2_read_btree(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ prt_newline(&i->buf); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); ++ ++ bch2_trans_exit(&trans); ++ ++ return err ?: i->ret; ++} ++ ++static const struct file_operations btree_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree, ++}; ++ ++static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct btree *b; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { ++ bch2_btree_node_to_text(&i->buf, i->c, b); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ /* ++ * can't easily correctly restart a btree node traversal across ++ * all nodes, meh ++ */ ++ i->from = bpos_cmp(SPOS_MAX, b->key.k.p) ++ ? bpos_successor(b->key.k.p) ++ : b->key.k.p; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? 
err : i->ret; ++} ++ ++static const struct file_operations btree_format_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree_formats, ++}; ++ ++static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ ++ struct btree_path_level *l = &iter.path->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ ++ if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); ++ i->prev_node = l->b->key.k.p; ++ } ++ ++ bch2_bfloat_to_text(&i->buf, l->b, _k); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); ++ ++ bch2_trans_exit(&trans); ++ ++ return err ?: i->ret; ++} ++ ++static const struct file_operations bfloat_failed_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_bfloat_failed, ++}; ++ ++static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ out->tabstops[0] = 32; ++ ++ prt_printf(out, "%px btree=%s l=%u ", ++ b, ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ prt_newline(out); ++ ++ prt_printf(out, "flags: "); ++ prt_tab(out); ++ prt_bitflags(out, bch2_btree_node_flags, b->flags); ++ prt_newline(out); ++ ++ prt_printf(out, "pcpu read locks: "); ++ prt_tab(out); ++ prt_printf(out, "%u", b->c.lock.readers != NULL); ++ prt_newline(out); ++ ++ prt_printf(out, "written:"); ++ prt_tab(out); ++ prt_printf(out, "%u", b->written); ++ prt_newline(out); ++ ++ prt_printf(out, "writes blocked:"); ++ prt_tab(out); ++ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); ++ prt_newline(out); ++ ++ prt_printf(out, "will make reachable:"); ++ prt_tab(out); ++ prt_printf(out, "%lx", b->will_make_reachable); ++ prt_newline(out); ++ ++ prt_printf(out, "journal pin %px:", &b->writes[0].journal); ++ prt_tab(out); ++ prt_printf(out, "%llu", b->writes[0].journal.seq); ++ prt_newline(out); ++ ++ prt_printf(out, "journal pin %px:", &b->writes[1].journal); ++ prt_tab(out); ++ prt_printf(out, "%llu", b->writes[1].journal.seq); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++} ++ ++static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ rcu_read_lock(); ++ i->buf.atomic++; ++ tbl = rht_dereference_rcu(c->btree_cache.table.tbl, ++ &c->btree_cache.table); ++ if (i->iter < tbl->size) { ++ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) ++ 
bch2_cached_btree_node_to_text(&i->buf, c, b); ++ i->iter++;; ++ } else { ++ done = true; ++ } ++ --i->buf.atomic; ++ rcu_read_unlock(); ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations cached_btree_nodes_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_cached_btree_nodes_read, ++}; ++ ++static int prt_backtrace(struct printbuf *out, struct task_struct *task) ++{ ++ unsigned long entries[32]; ++ unsigned i, nr_entries; ++ int ret; ++ ++ ret = down_read_killable(&task->signal->exec_update_lock); ++ if (ret) ++ return ret; ++ ++ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); ++ for (i = 0; i < nr_entries; i++) { ++ prt_printf(out, "[<0>] %pB", (void *)entries[i]); ++ prt_newline(out); ++ } ++ ++ up_read(&task->signal->exec_update_lock); ++ return 0; ++} ++ ++static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ struct btree_trans *trans; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(trans, &c->btree_trans_list, list) { ++ if (trans->task->pid <= i->iter) ++ continue; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ bch2_btree_trans_to_text(&i->buf, trans); ++ ++ prt_printf(&i->buf, "backtrace:"); ++ prt_newline(&i->buf); ++ printbuf_indent_add(&i->buf, 2); ++ prt_backtrace(&i->buf, trans->task); ++ printbuf_indent_sub(&i->buf, 2); ++ prt_newline(&i->buf); ++ ++ i->iter = trans->task->pid; ++ } ++ mutex_unlock(&c->btree_trans_lock); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations btree_transactions_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_btree_transactions_read, ++}; ++ ++static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); ++ i->iter++; ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations journal_pins_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_journal_pins_read, ++}; ++ ++static int lock_held_stats_open(struct inode *inode, struct file *file) ++{ ++ struct bch_fs *c = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ ++ if (!i) ++ return -ENOMEM; ++ ++ i->iter = 0; ++ i->c = c; ++ i->buf = PRINTBUF; ++ file->private_data = i; ++ ++ return 0; ++} ++ ++static int lock_held_stats_release(struct inode *inode, struct file *file) ++{ ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); ++ ++ return 0; ++} ++ ++static ssize_t lock_held_stats_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct lock_held_stats *lhs = 
&i->c->lock_held_stats; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ while (lhs->names[i->iter] != 0 && i->iter < BCH_LOCK_TIME_NR) { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ prt_printf(&i->buf, "%s:", lhs->names[i->iter]); ++ prt_newline(&i->buf); ++ printbuf_indent_add(&i->buf, 8); ++ bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]); ++ printbuf_indent_sub(&i->buf, 8); ++ prt_newline(&i->buf); ++ i->iter++; ++ } ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations lock_held_stats_op = { ++ .owner = THIS_MODULE, ++ .open = lock_held_stats_open, ++ .release = lock_held_stats_release, ++ .read = lock_held_stats_read, ++}; ++ ++void bch2_fs_debug_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->fs_debug_dir)) ++ debugfs_remove_recursive(c->fs_debug_dir); ++} ++ ++void bch2_fs_debug_init(struct bch_fs *c) ++{ ++ struct btree_debug *bd; ++ char name[100]; ++ ++ if (IS_ERR_OR_NULL(bch_debug)) ++ return; ++ ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ c->fs_debug_dir = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->fs_debug_dir)) ++ return; ++ ++ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, ++ c->btree_debug, &cached_btree_nodes_ops); ++ ++ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, ++ c->btree_debug, &btree_transactions_ops); ++ ++ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, ++ c->btree_debug, &journal_pins_ops); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { ++ debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir, ++ c, &lock_held_stats_op); ++ } ++ ++ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); ++ if (IS_ERR_OR_NULL(c->btree_debug_dir)) ++ return; ++ ++ for (bd = c->btree_debug; ++ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); ++ bd++) { ++ bd->id = bd - c->btree_debug; ++ debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->btree_debug_dir, bd, ++ &btree_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-formats", ++ bch2_btree_ids[bd->id]); ++ ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &btree_format_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-bfloat-failed", ++ bch2_btree_ids[bd->id]); ++ ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &bfloat_failed_debug_ops); ++ } ++} ++ ++#endif ++ ++void bch2_debug_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_debug)) ++ debugfs_remove_recursive(bch_debug); ++} ++ ++int __init bch2_debug_init(void) ++{ ++ int ret = 0; ++ ++ bch_debug = debugfs_create_dir("bcachefs", NULL); ++ return ret; ++} +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +new file mode 100644 +index 000000000000..0b86736e5e1b +--- /dev/null ++++ b/fs/bcachefs/debug.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DEBUG_H ++#define _BCACHEFS_DEBUG_H ++ ++#include "bcachefs.h" ++ ++struct bio; ++struct btree; ++struct bch_fs; ++ ++void __bch2_btree_verify(struct bch_fs *, struct btree *); ++ ++static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ if (bch2_verify_btree_ondisk) ++ __bch2_btree_verify(c, b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++void bch2_fs_debug_exit(struct bch_fs *); ++void bch2_fs_debug_init(struct bch_fs *); ++#else ++static inline void bch2_fs_debug_exit(struct bch_fs *c) {} ++static inline void bch2_fs_debug_init(struct bch_fs *c) {} ++#endif ++ ++void bch2_debug_exit(void); ++int 
bch2_debug_init(void); ++ ++#endif /* _BCACHEFS_DEBUG_H */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +new file mode 100644 +index 000000000000..4d942d224a08 +--- /dev/null ++++ b/fs/bcachefs/dirent.c +@@ -0,0 +1,565 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "dirent.h" ++#include "fs.h" ++#include "keylist.h" ++#include "str_hash.h" ++#include "subvolume.h" ++ ++#include ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++{ ++ unsigned len = bkey_val_bytes(d.k) - ++ offsetof(struct bch_dirent, d_name); ++ ++ return strnlen(d.v->d_name, len); ++} ++ ++static u64 bch2_dirent_hash(const struct bch_hash_info *info, ++ const struct qstr *name) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, name->name, name->len); ++ ++ /* [0,2) reserved for dots */ ++ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); ++} ++ ++static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_dirent_hash(info, key); ++} ++ ++static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ ++ return bch2_dirent_hash(info, &name); ++} ++ ++static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ int len = bch2_dirent_name_bytes(l); ++ const struct qstr *r = _r; ++ ++ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++} ++ ++static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); ++ int l_len = bch2_dirent_name_bytes(l); ++ int r_len = bch2_dirent_name_bytes(r); ++ ++ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++} ++ ++static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type == DT_SUBVOL) ++ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; ++ return true; ++} ++ ++const struct bch_hash_desc bch2_dirent_hash_desc = { ++ .btree_id = BTREE_ID_dirents, ++ .key_type = KEY_TYPE_dirent, ++ .hash_key = dirent_hash_key, ++ .hash_bkey = dirent_hash_bkey, ++ .cmp_key = dirent_cmp_key, ++ .cmp_bkey = dirent_cmp_bkey, ++ .is_visible = dirent_is_visible, ++}; ++ ++int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ unsigned len; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*d.v)); ++ return -EINVAL; ++ } ++ ++ len = bch2_dirent_name_bytes(d); ++ if (!len) { ++ prt_printf(err, "empty name"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { ++ prt_printf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k),dirent_val_u64s(len)); ++ return -EINVAL; ++ } ++ ++ if (len > BCH_NAME_MAX) { ++ prt_printf(err, "dirent name too big (%u > %u)", ++ len, BCH_NAME_MAX); ++ return -EINVAL; ++ } ++ ++ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { ++ prt_printf(err, "invalid name"); ++ return -EINVAL; ++ } ++ ++ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { ++ prt_printf(err, "invalid name"); ++ return 
-EINVAL; ++ } ++ ++ if (memchr(d.v->d_name, '/', len)) { ++ prt_printf(err, "invalid name"); ++ return -EINVAL; ++ } ++ ++ if (d.v->d_type != DT_SUBVOL && ++ le64_to_cpu(d.v->d_inum) == d.k->p.inode) { ++ prt_printf(err, "dirent points to own directory"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ prt_printf(out, "%.*s -> %llu type %s", ++ bch2_dirent_name_bytes(d), ++ d.v->d_name, ++ d.v->d_type != DT_SUBVOL ++ ? le64_to_cpu(d.v->d_inum) ++ : le32_to_cpu(d.v->d_child_subvol), ++ bch2_d_type_str(d.v->d_type)); ++} ++ ++static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++ subvol_inum dir, u8 type, ++ const struct qstr *name, u64 dst) ++{ ++ struct bkey_i_dirent *dirent; ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); ++ ++ if (name->len > BCH_NAME_MAX) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ BUG_ON(u64s > U8_MAX); ++ ++ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; ++ ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = u64s; ++ ++ if (type != DT_SUBVOL) { ++ dirent->v.d_inum = cpu_to_le64(dst); ++ } else { ++ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); ++ dirent->v.d_child_subvol = cpu_to_le32(dst); ++ } ++ ++ dirent->v.d_type = type; ++ ++ memcpy(dirent->v.d_name, name->name, name->len); ++ memset(dirent->v.d_name + name->len, 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); ++ ++ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); ++ ++ return dirent; ++} ++ ++int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, ++ const struct bch_hash_info *hash_info, ++ u8 type, const struct qstr *name, u64 dst_inum, ++ u64 *dir_offset, int flags) ++{ ++ struct bkey_i_dirent *dirent; ++ int ret; ++ ++ dirent = dirent_create_key(trans, dir, type, name, dst_inum); ++ ret = PTR_ERR_OR_ZERO(dirent); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir, &dirent->k_i, flags); ++ *dir_offset = dirent->k.p.offset; ++ ++ return ret; ++} ++ ++static void dirent_copy_target(struct bkey_i_dirent *dst, ++ struct bkey_s_c_dirent src) ++{ ++ dst->v.d_inum = src.v->d_inum; ++ dst->v.d_type = src.v->d_type; ++} ++ ++int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, ++ struct bkey_s_c_dirent d, subvol_inum *target) ++{ ++ struct bch_subvolume s; ++ int ret = 0; ++ ++ if (d.v->d_type == DT_SUBVOL && ++ d.v->d_parent_subvol != dir.subvol) ++ return 1; ++ ++ if (likely(d.v->d_type != DT_SUBVOL)) { ++ target->subvol = dir.subvol; ++ target->inum = le64_to_cpu(d.v->d_inum); ++ } else { ++ target->subvol = le32_to_cpu(d.v->d_child_subvol); ++ ++ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); ++ ++ target->inum = le64_to_cpu(s.inode); ++ } ++ ++ return ret; ++} ++ ++int bch2_dirent_rename(struct btree_trans *trans, ++ subvol_inum src_dir, struct bch_hash_info *src_hash, ++ subvol_inum dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, ++ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter src_iter = { NULL }; ++ struct btree_iter dst_iter = { NULL }; ++ struct bkey_s_c old_src, old_dst = bkey_s_c_null; ++ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; ++ struct bpos 
dst_pos = ++ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); ++ unsigned src_type = 0, dst_type = 0, src_update_flags = 0; ++ int ret = 0; ++ ++ if (src_dir.subvol != dst_dir.subvol) ++ return -EXDEV; ++ ++ memset(src_inum, 0, sizeof(*src_inum)); ++ memset(dst_inum, 0, sizeof(*dst_inum)); ++ ++ /* Lookup src: */ ++ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto out; ++ ++ old_src = bch2_btree_iter_peek_slot(&src_iter); ++ ret = bkey_err(old_src); ++ if (ret) ++ goto out; ++ ++ ret = bch2_dirent_read_target(trans, src_dir, ++ bkey_s_c_to_dirent(old_src), src_inum); ++ if (ret) ++ goto out; ++ ++ src_type = bkey_s_c_to_dirent(old_src).v->d_type; ++ ++ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) ++ return -EOPNOTSUPP; ++ ++ ++ /* Lookup dst: */ ++ if (mode == BCH_RENAME) { ++ /* ++ * Note that we're _not_ checking if the target already exists - ++ * we're relying on the VFS to do that check for us for ++ * correctness: ++ */ ++ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name); ++ if (ret) ++ goto out; ++ } else { ++ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto out; ++ ++ old_dst = bch2_btree_iter_peek_slot(&dst_iter); ++ ret = bkey_err(old_dst); ++ if (ret) ++ goto out; ++ ++ ret = bch2_dirent_read_target(trans, dst_dir, ++ bkey_s_c_to_dirent(old_dst), dst_inum); ++ if (ret) ++ goto out; ++ ++ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; ++ ++ if (dst_type == DT_SUBVOL) ++ return -EOPNOTSUPP; ++ } ++ ++ if (mode != BCH_RENAME_EXCHANGE) ++ *src_offset = dst_iter.pos.offset; ++ ++ /* Create new dst key: */ ++ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_dst); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); ++ new_dst->k.p = dst_iter.pos; ++ ++ /* Create new src key: */ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); ++ new_src->k.p = src_iter.pos; ++ } else { ++ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ bkey_init(&new_src->k); ++ new_src->k.p = src_iter.pos; ++ ++ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && ++ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { ++ /* ++ * We have a hash collision for the new dst key, ++ * and new_src - the key we're deleting - is between ++ * new_dst's hashed slot and the slot we're going to be ++ * inserting it into - oops. 
This will break the hash ++ * table if we don't deal with it: ++ */ ++ if (mode == BCH_RENAME) { ++ /* ++ * If we're not overwriting, we can just insert ++ * new_dst at the src position: ++ */ ++ new_src = new_dst; ++ new_src->k.p = src_iter.pos; ++ goto out_set_src; ++ } else { ++ /* If we're overwriting, we can't insert new_dst ++ * at a different slot because it has to ++ * overwrite old_dst - just make sure to use a ++ * whiteout when deleting src: ++ */ ++ new_src->k.type = KEY_TYPE_hash_whiteout; ++ } ++ } else { ++ /* Check if we need a whiteout to delete src: */ ++ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, ++ src_hash, &src_iter); ++ if (ret < 0) ++ goto out; ++ ++ if (ret) ++ new_src->k.type = KEY_TYPE_hash_whiteout; ++ } ++ } ++ ++ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ if (ret) ++ goto out; ++out_set_src: ++ ++ /* ++ * If we're deleting a subvolume, we need to really delete the dirent, ++ * not just emit a whiteout in the current snapshot: ++ */ ++ if (src_type == DT_SUBVOL) { ++ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&src_iter); ++ if (ret) ++ goto out; ++ ++ new_src->k.p = src_iter.pos; ++ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; ++ } ++ ++ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ if (ret) ++ goto out; ++ ++ if (mode == BCH_RENAME_EXCHANGE) ++ *src_offset = new_src->k.p.offset; ++ *dst_offset = new_dst->k.p.offset; ++out: ++ bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(trans, &dst_iter); ++ return ret; ++} ++ ++int __bch2_dirent_lookup_trans(struct btree_trans *trans, ++ struct btree_iter *iter, ++ subvol_inum dir, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, subvol_inum *inum, ++ unsigned flags) ++{ ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, ++ hash_info, dir, name, flags); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ d = bkey_s_c_to_dirent(k); ++ ++ ret = bch2_dirent_read_target(trans, dir, d, inum); ++ if (ret > 0) ++ ret = -ENOENT; ++err: ++ if (ret) ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret; ++} ++ ++u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, subvol_inum *inum) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, ++ name, inum, 0); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (!ret) ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(dir.inum, 0, snapshot), ++ POS(dir.inum, U64_MAX), 0, k, ret) ++ if (k.k->type == KEY_TYPE_dirent) { ++ ret = -ENOTEMPTY; ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++int 
bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ subvol_inum target; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, ++ SPOS(inum.inum, ctx->pos, snapshot), ++ POS(inum.inum, U64_MAX), 0, k, ret) { ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ ret = bch2_dirent_read_target(&trans, inum, dirent, &target); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ ++ /* ++ * XXX: dir_emit() can fault and block, while we're holding ++ * locks ++ */ ++ ctx->pos = dirent.k->p.offset; ++ if (!dir_emit(ctx, dirent.v->d_name, ++ bch2_dirent_name_bytes(dirent), ++ target.inum, ++ vfs_d_type(dirent.v->d_type))) ++ break; ++ ctx->pos = dirent.k->p.offset + 1; ++ ++ /* ++ * read_target looks up subvolumes, we can overflow paths if the ++ * directory has many subvolumes in it ++ */ ++ ret = btree_trans_too_many_iters(&trans); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +new file mode 100644 +index 000000000000..b1466932c768 +--- /dev/null ++++ b/fs/bcachefs/dirent.h +@@ -0,0 +1,67 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DIRENT_H ++#define _BCACHEFS_DIRENT_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_dirent_hash_desc; ++ ++int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++ .key_invalid = bch2_dirent_invalid, \ ++ .val_to_text = bch2_dirent_to_text, \ ++} ++ ++struct qstr; ++struct file; ++struct dir_context; ++struct bch_fs; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++ ++static inline unsigned dirent_val_u64s(unsigned len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, ++ sizeof(u64)); ++} ++ ++int bch2_dirent_read_target(struct btree_trans *, subvol_inum, ++ struct bkey_s_c_dirent, subvol_inum *); ++ ++int bch2_dirent_create(struct btree_trans *, subvol_inum, ++ const struct bch_hash_info *, u8, ++ const struct qstr *, u64, u64 *, int); ++ ++static inline unsigned vfs_d_type(unsigned type) ++{ ++ return type == DT_SUBVOL ? 
DT_DIR : type; ++} ++ ++enum bch_rename_mode { ++ BCH_RENAME, ++ BCH_RENAME_OVERWRITE, ++ BCH_RENAME_EXCHANGE, ++}; ++ ++int bch2_dirent_rename(struct btree_trans *, ++ subvol_inum, struct bch_hash_info *, ++ subvol_inum, struct bch_hash_info *, ++ const struct qstr *, subvol_inum *, u64 *, ++ const struct qstr *, subvol_inum *, u64 *, ++ enum bch_rename_mode); ++ ++int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, ++ subvol_inum, const struct bch_hash_info *, ++ const struct qstr *, subvol_inum *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, ++ const struct bch_hash_info *, ++ const struct qstr *, subvol_inum *); ++ ++int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); ++int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); ++ ++#endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +new file mode 100644 +index 000000000000..7bd4413671d2 +--- /dev/null ++++ b/fs/bcachefs/disk_groups.c +@@ -0,0 +1,506 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "super-io.h" ++ ++#include ++ ++static int group_cmp(const void *_l, const void *_r) ++{ ++ const struct bch_disk_group *l = _l; ++ const struct bch_disk_group *r = _r; ++ ++ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - ++ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ++ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - ++ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: ++ strncmp(l->label, r->label, sizeof(l->label)); ++} ++ ++static int bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g, *sorted = NULL; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned nr_groups = disk_groups_nr(groups); ++ unsigned i, len; ++ int ret = -EINVAL; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned g; ++ ++ if (!BCH_MEMBER_GROUP(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (g >= nr_groups) { ++ prt_printf(err, "disk %u has invalid label %u (have %u)", ++ i, g, nr_groups); ++ return -EINVAL; ++ } ++ ++ if (BCH_GROUP_DELETED(&groups->entries[g])) { ++ prt_printf(err, "disk %u has deleted label %u", i, g); ++ return -EINVAL; ++ } ++ } ++ ++ if (!nr_groups) ++ return 0; ++ ++ for (i = 0; i < nr_groups; i++) { ++ g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ len = strnlen(g->label, sizeof(g->label)); ++ if (!len) { ++ prt_printf(err, "label %u empty", i); ++ return -EINVAL; ++ } ++ } ++ ++ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); ++ if (!sorted) ++ return -ENOMEM; ++ ++ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); ++ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); ++ ++ for (g = sorted; g + 1 < sorted + nr_groups; g++) ++ if (!BCH_GROUP_DELETED(g) && ++ !group_cmp(&g[0], &g[1])) { ++ prt_printf(err, "duplicate label %llu.%.*s", ++ BCH_GROUP_PARENT(g), ++ (int) sizeof(g->label), g->label); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(sorted); ++ return 0; ++} ++ ++static void bch2_sb_disk_groups_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g; ++ unsigned nr_groups = disk_groups_nr(groups); ++ ++ for (g = groups->entries; 
++ g < groups->entries + nr_groups; ++ g++) { ++ if (g != groups->entries) ++ prt_printf(out, " "); ++ ++ if (BCH_GROUP_DELETED(g)) ++ prt_printf(out, "[deleted]"); ++ else ++ prt_printf(out, "[parent %llu name %s]", ++ BCH_GROUP_PARENT(g), g->label); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { ++ .validate = bch2_sb_disk_groups_validate, ++ .to_text = bch2_sb_disk_groups_to_text ++}; ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field_disk_groups *groups; ++ struct bch_disk_groups_cpu *cpu_g, *old_g; ++ unsigned i, g, nr_groups; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ if (!groups) ++ return 0; ++ ++ cpu_g = kzalloc(sizeof(*cpu_g) + ++ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); ++ if (!cpu_g) ++ return -ENOMEM; ++ ++ cpu_g->nr = nr_groups; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *src = &groups->entries[i]; ++ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; ++ ++ dst->deleted = BCH_GROUP_DELETED(src); ++ dst->parent = BCH_GROUP_PARENT(src); ++ } ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ struct bch_disk_group_cpu *dst = ++ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m); ++ while (g) { ++ dst = &cpu_g->entries[g - 1]; ++ __set_bit(i, dst->devs.d); ++ g = dst->parent; ++ } ++ } ++ ++ old_g = rcu_dereference_protected(c->disk_groups, ++ lockdep_is_held(&c->sb_lock)); ++ rcu_assign_pointer(c->disk_groups, cpu_g); ++ if (old_g) ++ kfree_rcu(old_g, rcu); ++ ++ return 0; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return NULL; ++ case TARGET_DEV: { ++ struct bch_dev *ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ return ca ? &ca->self : NULL; ++ } ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ return g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return false; ++ case TARGET_DEV: ++ return dev == t.dev; ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g; ++ const struct bch_devs_mask *m; ++ bool ret; ++ ++ rcu_read_lock(); ++ g = rcu_dereference(c->disk_groups); ++ m = g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ ++ ret = m ? 
test_bit(dev, m->d) : false; ++ rcu_read_unlock(); ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, ++ unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ if (!BCH_GROUP_DELETED(g) && ++ BCH_GROUP_PARENT(g) == parent && ++ strnlen(g->label, sizeof(g->label)) == namelen && ++ !memcmp(name, g->label, namelen)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ struct bch_disk_group *g; ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; ++ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); ++ i++) ++ ; ++ ++ if (i == nr_groups) { ++ unsigned u64s = ++ (sizeof(struct bch_sb_field_disk_groups) + ++ sizeof(struct bch_disk_group) * (nr_groups + 1)) / ++ sizeof(u64); ++ ++ groups = bch2_sb_resize_disk_groups(sb, u64s); ++ if (!groups) ++ return -ENOSPC; ++ ++ nr_groups = disk_groups_nr(groups); ++ } ++ ++ BUG_ON(i >= nr_groups); ++ ++ g = &groups->entries[i]; ++ ++ memcpy(g->label, name, namelen); ++ if (namelen < sizeof(g->label)) ++ g->label[namelen] = '\0'; ++ SET_BCH_GROUP_DELETED(g, 0); ++ SET_BCH_GROUP_PARENT(g, parent); ++ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); ++ ++ return i; ++} ++ ++int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ v = __bch2_disk_group_find(groups, v + 1, name, len); ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups; ++ unsigned parent = 0; ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ groups = bch2_sb_get_disk_groups(sb->sb); ++ ++ v = __bch2_disk_group_find(groups, parent, name, len); ++ if (v < 0) ++ v = __bch2_disk_group_add(sb, parent, name, len); ++ if (v < 0) ++ return v; ++ ++ parent = v + 1; ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb); ++ struct bch_disk_group *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto inval; ++ ++ if (v >= disk_groups_nr(groups)) ++ goto inval; ++ ++ g = groups->entries + v; ++ ++ if (BCH_GROUP_DELETED(g)) ++ goto inval; ++ ++ path[nr++] = v; ++ ++ if (!BCH_GROUP_PARENT(g)) ++ break; ++ ++ v = BCH_GROUP_PARENT(g) - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); ++ if (nr) ++ prt_printf(out, "."); ++ } ++ return; ++inval: ++ prt_printf(out, "invalid label %u", v); ++} ++ ++int bch2_dev_group_set(struct 
bch_fs *c, struct bch_dev *ca, const char *name) ++{ ++ struct bch_member *mi; ++ int v = -1; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (!strlen(name) || !strcmp(name, "none")) ++ goto write_sb; ++ ++ v = bch2_disk_path_find_or_create(&c->disk_sb, name); ++ if (v < 0) { ++ mutex_unlock(&c->sb_lock); ++ return v; ++ } ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ goto unlock; ++write_sb: ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ SET_BCH_MEMBER_GROUP(mi, v + 1); ++ ++ bch2_write_super(c); ++unlock: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++{ ++ struct bch_dev *ca; ++ int g; ++ ++ if (!strlen(buf) || !strcmp(buf, "none")) { ++ *v = 0; ++ return 0; ++ } ++ ++ /* Is it a device? */ ++ ca = bch2_dev_lookup(c, buf); ++ if (!IS_ERR(ca)) { ++ *v = dev_to_target(ca->dev_idx); ++ percpu_ref_put(&ca->ref); ++ return 0; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ g = bch2_disk_path_find(&c->disk_sb, buf); ++ mutex_unlock(&c->sb_lock); ++ ++ if (g >= 0) { ++ *v = group_to_target(g); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ prt_printf(out, "none"); ++ break; ++ case TARGET_DEV: ++ if (c) { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ prt_printf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ prt_printf(out, "offline device %u", t.dev); ++ } else { ++ prt_printf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); ++ } else { ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_member *m = mi->members + t.dev; ++ ++ if (bch2_dev_exists(sb, mi, t.dev)) { ++ prt_printf(out, "Device "); ++ pr_uuid(out, m->uuid.b); ++ prt_printf(out, " (%u)", t.dev); ++ } else { ++ prt_printf(out, "Bad device %u", t.dev); ++ } ++ } ++ break; ++ case TARGET_GROUP: ++ if (c) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ } else { ++ bch2_disk_path_to_text(out, sb, t.group); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +new file mode 100644 +index 000000000000..de915480514b +--- /dev/null ++++ b/fs/bcachefs/disk_groups.h +@@ -0,0 +1,90 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_H ++#define _BCACHEFS_DISK_GROUPS_H ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; ++ ++static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) ++{ ++ return groups ++ ? 
(vstruct_end(&groups->field) - ++ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) ++ : 0; ++} ++ ++struct target { ++ enum { ++ TARGET_NULL, ++ TARGET_DEV, ++ TARGET_GROUP, ++ } type; ++ union { ++ unsigned dev; ++ unsigned group; ++ }; ++}; ++ ++#define TARGET_DEV_START 1 ++#define TARGET_GROUP_START (256 + TARGET_DEV_START) ++ ++static inline u16 dev_to_target(unsigned dev) ++{ ++ return TARGET_DEV_START + dev; ++} ++ ++static inline u16 group_to_target(unsigned group) ++{ ++ return TARGET_GROUP_START + group; ++} ++ ++static inline struct target target_decode(unsigned target) ++{ ++ if (target >= TARGET_GROUP_START) ++ return (struct target) { ++ .type = TARGET_GROUP, ++ .group = target - TARGET_GROUP_START ++ }; ++ ++ if (target >= TARGET_DEV_START) ++ return (struct target) { ++ .type = TARGET_DEV, ++ .group = target - TARGET_DEV_START ++ }; ++ ++ return (struct target) { .type = TARGET_NULL }; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); ++ ++static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, ++ enum bch_data_type data_type, ++ u16 target) ++{ ++ struct bch_devs_mask devs = c->rw_devs[data_type]; ++ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); ++ ++ if (t) ++ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); ++ return devs; ++} ++ ++bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); ++ ++int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++ ++/* Exported for userspace bcachefs-tools: */ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++ ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); ++ ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *); ++ ++int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); ++ ++const char *bch2_sb_validate_disk_groups(struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +new file mode 100644 +index 000000000000..f33acf1af110 +--- /dev/null ++++ b/fs/bcachefs/ec.c +@@ -0,0 +1,1673 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* erasure coding */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include <linux/sort.h> ++ ++#ifdef __KERNEL__ ++ ++#include <linux/raid/pq.h> ++#include <linux/raid/xor.h> ++ ++static void raid5_recov(unsigned disks, unsigned failed_idx, ++ size_t size, void **data) ++{ ++ unsigned i = 2, nr; ++ ++ BUG_ON(failed_idx >= disks); ++ ++ swap(data[0], data[failed_idx]); ++ memcpy(data[0], data[1], size); ++ ++ while (i < disks) { ++ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); ++ xor_blocks(nr, size, data[0], data + i); ++ i += nr; ++ } ++ ++ swap(data[0], data[failed_idx]); ++} ++ ++static void raid_gen(int nd, int np, size_t size, void **v) ++{ ++ if (np >= 1) ++ raid5_recov(nd + np, nd, size, v); ++ if (np >= 2) ++ raid6_call.gen_syndrome(nd + np, size, v); ++ BUG_ON(np > 2); ++} ++ ++static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) ++{ ++ switch (nr) { ++ case 0: ++ break; ++ case 1: ++ if (ir[0] < nd
+ 1) ++ raid5_recov(nd + 1, ir[0], size, v); ++ else ++ raid6_call.gen_syndrome(nd + np, size, v); ++ break; ++ case 2: ++ if (ir[1] < nd) { ++ /* data+data failure. */ ++ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); ++ } else if (ir[0] < nd) { ++ /* data + p/q failure */ ++ ++ if (ir[1] == nd) /* data + p failure */ ++ raid6_datap_recov(nd + np, size, ir[0], v); ++ else { /* data + q failure */ ++ raid5_recov(nd + 1, ir[0], size, v); ++ raid6_call.gen_syndrome(nd + np, size, v); ++ } ++ } else { ++ raid_gen(nd, np, size, v); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++#else ++ ++#include <raid/raid.h> ++ ++#endif ++ ++struct ec_bio { ++ struct bch_dev *ca; ++ struct ec_stripe_buf *buf; ++ size_t idx; ++ struct bio bio; ++}; ++ ++/* Stripes btree keys: */ ++ ++int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ ++ if (!bkey_cmp(k.k->p, POS_MIN)) { ++ prt_printf(err, "stripe at POS_MIN"); ++ return -EINVAL; ++ } ++ ++ if (k.k->p.inode) { ++ prt_printf(err, "nonzero inode field"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*s)); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { ++ prt_printf(err, "incorrect value size (%zu < %u)", ++ bkey_val_u64s(k.k), stripe_val_u64s(s)); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned i; ++ ++ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", ++ s->algorithm, ++ le16_to_cpu(s->sectors), ++ s->nr_blocks - s->nr_redundant, ++ s->nr_redundant, ++ s->csum_type, ++ 1U << s->csum_granularity_bits); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, ++ (u64) s->ptrs[i].offset, ++ stripe_blockcount_get(s, i)); ++} ++ ++/* returns blocknr in stripe that we matched: */ ++static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, ++ struct bkey_s_c k, unsigned *block) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned i, nr_data = s->nr_blocks - s->nr_redundant; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ for (i = 0; i < nr_data; i++) ++ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, ++ le16_to_cpu(s->sectors))) { ++ *block = i; ++ return ptr; ++ } ++ ++ return NULL; ++} ++ ++static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ ++ extent_for_each_entry(e, entry) ++ if (extent_entry_type(entry) == ++ BCH_EXTENT_ENTRY_stripe_ptr && ++ entry->stripe_ptr.idx == idx) ++ return true; ++ ++ break; ++ } ++ } ++ ++ return false; ++} ++ ++/* Stripe bufs: */ ++ ++static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) ++{ ++ unsigned i; ++ ++ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ kvpfree(buf->data[i], buf->size << 9); ++ buf->data[i] = NULL; ++ } ++} ++ ++static int ec_stripe_buf_init(struct ec_stripe_buf *buf, ++ unsigned offset, unsigned size) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1U << v->csum_granularity_bits; ++ unsigned end = offset + size; ++ unsigned i; ++ ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ 
offset = round_down(offset, csum_granularity); ++ end = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)); ++ ++ buf->offset = offset; ++ buf->size = end - offset; ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); ++ if (!buf->data[i]) ++ goto err; ++ } ++ ++ return 0; ++err: ++ ec_stripe_buf_exit(buf); ++ return -ENOMEM; ++} ++ ++/* Checksumming: */ ++ ++static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, ++ unsigned block, unsigned offset) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned end = buf->offset + buf->size; ++ unsigned len = min(csum_granularity, end - offset); ++ ++ BUG_ON(offset >= end); ++ BUG_ON(offset < buf->offset); ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ (len & (csum_granularity - 1))); ++ ++ return bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[block] + ((offset - buf->offset) << 9), ++ len << 9); ++} ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, j, csums_per_device = stripe_csums_per_device(v); ++ ++ if (!v->csum_type) ++ return; ++ ++ BUG_ON(buf->offset); ++ BUG_ON(buf->size != le16_to_cpu(v->sectors)); ++ ++ for (i = 0; i < v->nr_blocks; i++) ++ for (j = 0; j < csums_per_device; j++) ++ stripe_csum_set(v, i, j, ++ ec_block_checksum(buf, i, j << v->csum_granularity_bits)); ++} ++ ++static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned i; ++ ++ if (!v->csum_type) ++ return; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ unsigned offset = buf->offset; ++ unsigned end = buf->offset + buf->size; ++ ++ if (!test_bit(i, buf->valid)) ++ continue; ++ ++ while (offset < end) { ++ unsigned j = offset >> v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, end - offset); ++ struct bch_csum want = stripe_csum_get(v, i, j); ++ struct bch_csum got = ec_block_checksum(buf, i, offset); ++ ++ if (bch2_crc_cmp(want, got)) { ++ struct printbuf buf2 = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); ++ ++ bch_err_ratelimited(c, ++ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", ++ (void *) _RET_IP_, i, j, v->csum_type, ++ want.lo, got.lo, buf2.buf); ++ printbuf_exit(&buf2); ++ clear_bit(i, buf->valid); ++ break; ++ } ++ ++ offset += len; ++ } ++ } ++} ++ ++/* Erasure coding: */ ++ ++static void ec_generate_ec(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = le16_to_cpu(v->sectors) << 9; ++ ++ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); ++} ++ ++static unsigned ec_nr_failed(struct ec_stripe_buf *buf) ++{ ++ return buf->key.v.nr_blocks - ++ bitmap_weight(buf->valid, buf->key.v.nr_blocks); ++} ++ ++static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = buf->size << 9; ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ return 
-1; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (!test_bit(i, buf->valid)) ++ failed[nr_failed++] = i; ++ ++ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); ++ return 0; ++} ++ ++/* IO: */ ++ ++static void ec_block_endio(struct bio *bio) ++{ ++ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); ++ struct bch_stripe *v = &ec_bio->buf->key.v; ++ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; ++ struct bch_dev *ca = ec_bio->ca; ++ struct closure *cl = bio->bi_private; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", ++ bio_data_dir(bio) ? "write" : "read", ++ bch2_blk_status_to_str(bio->bi_status))) ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ ++ if (ptr_stale(ca, ptr)) { ++ bch_err_ratelimited(ca->fs, ++ "error %s stripe: stale pointer after io", ++ bio_data_dir(bio) == READ ? "reading from" : "writing to"); ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ } ++ ++ bio_put(&ec_bio->bio); ++ percpu_ref_put(&ca->io_ref); ++ closure_put(cl); ++} ++ ++static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ++ unsigned rw, unsigned idx, struct closure *cl) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned offset = 0, bytes = buf->size << 9; ++ struct bch_extent_ptr *ptr = &v->ptrs[idx]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant ++ ? BCH_DATA_user ++ : BCH_DATA_parity; ++ ++ if (ptr_stale(ca, ptr)) { ++ bch_err_ratelimited(c, ++ "error %s stripe: stale pointer", ++ rw == READ ? "reading from" : "writing to"); ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ if (!bch2_dev_get_ioref(ca, rw)) { ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); ++ ++ while (offset < bytes) { ++ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, ++ DIV_ROUND_UP(bytes, PAGE_SIZE)); ++ unsigned b = min_t(size_t, bytes - offset, ++ nr_iovecs << PAGE_SHIFT); ++ struct ec_bio *ec_bio; ++ ++ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, ++ nr_iovecs, ++ rw, ++ GFP_KERNEL, ++ &c->ec_bioset), ++ struct ec_bio, bio); ++ ++ ec_bio->ca = ca; ++ ec_bio->buf = buf; ++ ec_bio->idx = idx; ++ ++ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ++ ec_bio->bio.bi_end_io = ec_block_endio; ++ ec_bio->bio.bi_private = cl; ++ ++ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); ++ ++ closure_get(cl); ++ percpu_ref_get(&ca->io_ref); ++ ++ submit_bio(&ec_bio->bio); ++ ++ offset += b; ++ } ++ ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, ++ POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ if (k.k->type != KEY_TYPE_stripe) { ++ ret = -ENOENT; ++ goto err; ++ } ++ bkey_reassemble(&stripe->key.k_i, k); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ ++ struct ec_stripe_buf *buf; ++ struct closure cl; ++ struct bch_stripe *v; ++ unsigned i, offset; ++ int ret = 0; ++ ++ closure_init_stack(&cl); ++ ++ BUG_ON(!rbio->pick.has_ec); ++ ++ buf = 
kzalloc(sizeof(*buf), GFP_NOIO); ++ if (!buf) ++ return -ENOMEM; ++ ++ ret = get_stripe_key(c, rbio->pick.ec.idx, buf); ++ if (ret) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: error %i looking up stripe", ret); ++ kfree(buf); ++ return -EIO; ++ } ++ ++ v = &buf->key.v; ++ ++ if (!bch2_ptr_matches_stripe(v, rbio->pick)) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: pointer doesn't match stripe"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; ++ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: read is bigger than stripe"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < v->nr_blocks; i++) ++ ec_block_io(c, buf, REQ_OP_READ, i, &cl); ++ ++ closure_sync(&cl); ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ec_validate_checksums(c, buf); ++ ++ ret = ec_do_recov(c, buf); ++ if (ret) ++ goto err; ++ ++ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, ++ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); ++err: ++ ec_stripe_buf_exit(buf); ++ kfree(buf); ++ return ret; ++} ++ ++/* stripe bucket accounting: */ ++ ++static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) ++{ ++ ec_stripes_heap n, *h = &c->ec_stripes_heap; ++ ++ if (idx >= h->size) { ++ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) ++ return -ENOMEM; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ if (n.size > h->size) { ++ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); ++ n.used = h->used; ++ swap(*h, n); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ free_heap(&n); ++ } ++ ++ if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) ++ return -ENOMEM; ++ ++ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && ++ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int ec_stripe_mem_alloc(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ size_t idx = iter->pos.offset; ++ ++ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ return 0; ++ ++ bch2_trans_unlock(trans); ++ ++ return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: ++ bch2_trans_relock(trans); ++} ++ ++static ssize_t stripe_idx_to_delete(struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ ++ return h->used && h->data[0].blocks_nonempty == 0 ++ ? 
h->data[0].idx : -1; ++} ++ ++static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, ++ struct ec_stripe_heap_entry l, ++ struct ec_stripe_heap_entry r) ++{ ++ return ((l.blocks_nonempty > r.blocks_nonempty) - ++ (l.blocks_nonempty < r.blocks_nonempty)); ++} ++ ++static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, ++ size_t i) ++{ ++ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); ++ ++ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; ++} ++ ++static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m = genradix_ptr(&c->stripes, idx); ++ ++ BUG_ON(!m->alive); ++ BUG_ON(m->heap_idx >= h->used); ++ BUG_ON(h->data[m->heap_idx].idx != idx); ++} ++ ++void bch2_stripes_heap_del(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (!m->on_heap) ++ return; ++ ++ m->on_heap = false; ++ ++ heap_verify_backpointer(c, idx); ++ ++ heap_del(&c->ec_stripes_heap, m->heap_idx, ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++} ++ ++void bch2_stripes_heap_insert(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (m->on_heap) ++ return; ++ ++ BUG_ON(heap_full(&c->ec_stripes_heap)); ++ ++ m->on_heap = true; ++ ++ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { ++ .idx = idx, ++ .blocks_nonempty = m->blocks_nonempty, ++ }), ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++} ++ ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (!m->on_heap) ++ return; ++ ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ ++/* stripe deletion */ ++ ++static int ec_stripe_delete(struct bch_fs *c, size_t idx) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_stripes, ++ POS(0, idx), ++ POS(0, idx + 1), ++ 0, NULL); ++} ++ ++static void ec_stripe_delete_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, ec_stripe_delete_work); ++ ssize_t idx; ++ ++ while (1) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ idx = stripe_idx_to_delete(c); ++ if (idx < 0) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ break; ++ } ++ ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ if (ec_stripe_delete(c, idx)) ++ break; ++ } ++} ++ ++/* stripe creation: */ ++ ++static int ec_stripe_bkey_insert(struct btree_trans *trans, ++ struct bkey_i_stripe *stripe, ++ struct disk_reservation *res) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bpos min_pos = POS(0, 1); ++ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); ++ int ret; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { ++ if (start_pos.offset) { ++ start_pos = min_pos; ++ bch2_btree_iter_set_pos(&iter, start_pos); ++ continue; ++ } ++ ++ ret = -ENOSPC; ++ break; ++ } ++ ++ if 
(bkey_deleted(k.k)) ++ break; ++ } ++ ++ c->ec_stripe_hint = iter.pos.offset; ++ ++ if (ret) ++ goto err; ++ ++ ret = ec_stripe_mem_alloc(trans, &iter); ++ if (ret) ++ goto err; ++ ++ stripe->k.p = iter.pos; ++ ++ ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int ec_stripe_bkey_update(struct btree_trans *trans, ++ struct bkey_i_stripe *new, ++ struct disk_reservation *res) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *existing; ++ unsigned i; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, ++ new->k.p, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || k.k->type != KEY_TYPE_stripe) { ++ bch_err(trans->c, "error updating stripe: not found"); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ existing = bkey_s_c_to_stripe(k).v; ++ ++ if (existing->nr_blocks != new->v.nr_blocks) { ++ bch_err(trans->c, "error updating stripe: nr_blocks does not match"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, ++ stripe_blockcount_get(existing, i)); ++ ++ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static void extent_stripe_ptr_add(struct bkey_s_extent e, ++ struct ec_stripe_buf *s, ++ struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ struct bch_extent_stripe_ptr *dst = (void *) ptr; ++ union bch_extent_entry *end = extent_entry_last(e); ++ ++ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); ++ e.k->u64s += sizeof(*dst) / sizeof(u64); ++ ++ *dst = (struct bch_extent_stripe_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, ++ .block = block, ++ .redundancy = s->key.v.nr_redundant, ++ .idx = s->key.k.p.offset, ++ }; ++} ++ ++static int ec_stripe_update_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct ec_stripe_buf *s, ++ struct bpos end) ++{ ++ const struct bch_extent_ptr *ptr_c; ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ struct bkey_i *n; ++ int ret, dev, block; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ return 1; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) ++ return 0; ++ ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? 
++ */ ++ if (!ptr_c || ptr_c->cached) ++ return 0; ++ ++ dev = s->key.v.ptrs[block].dev; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); ++ BUG_ON(!ec_ptr); ++ ++ extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); ++ ++ return bch2_trans_update(trans, iter, n, 0); ++} ++ ++static int ec_stripe_update_extents(struct bch_fs *c, ++ struct ec_stripe_buf *s, ++ struct bkey *pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, bkey_start_pos(pos), ++ BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); ++} ++ ++/* ++ * data buckets of new stripe all written: create the stripe ++ */ ++static void ec_stripe_create(struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = s->c; ++ struct open_bucket *ob; ++ struct bkey_i *k; ++ struct stripe *m; ++ struct bch_stripe *v = &s->new_stripe.key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ int ret; ++ ++ BUG_ON(s->h->s == s); ++ ++ closure_sync(&s->iodone); ++ ++ if (s->err) { ++ if (s->err != -EROFS) ++ bch_err(c, "error creating stripe: error writing data buckets"); ++ goto err; ++ } ++ ++ if (s->have_existing_stripe) { ++ ec_validate_checksums(c, &s->existing_stripe); ++ ++ if (ec_do_recov(c, &s->existing_stripe)) { ++ bch_err(c, "error creating stripe: error reading existing stripe"); ++ goto err; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) ++ swap(s->new_stripe.data[i], ++ s->existing_stripe.data[i]); ++ ++ ec_stripe_buf_exit(&s->existing_stripe); ++ } ++ ++ BUG_ON(!s->allocated); ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ goto err; ++ ++ ec_generate_ec(&s->new_stripe); ++ ++ ec_generate_checksums(&s->new_stripe); ++ ++ /* write p/q: */ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); ++ closure_sync(&s->iodone); ++ ++ if (ec_nr_failed(&s->new_stripe)) { ++ bch_err(c, "error creating stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } ++ ++ ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, ++ s->have_existing_stripe ++ ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) ++ : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); ++ if (ret) { ++ bch_err(c, "error creating stripe: error creating stripe key"); ++ goto err_put_writes; ++ } ++ ++ for_each_keylist_key(&s->keys, k) { ++ ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); ++ if (ret) { ++ bch_err(c, "error creating stripe: error updating pointers: %s", ++ bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); ++ ++ BUG_ON(m->on_heap); ++ bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++err_put_writes: ++ percpu_ref_put(&c->writes); ++err: ++ bch2_disk_reservation_put(c, &s->res); ++ ++ for (i = 0; i < v->nr_blocks; i++) ++ if (s->blocks[i]) { ++ ob = c->open_buckets + s->blocks[i]; ++ ++ if (i < nr_data) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++ } ++ ++ bch2_keylist_free(&s->keys, s->inline_keys); ++ ++ ec_stripe_buf_exit(&s->existing_stripe); ++ ec_stripe_buf_exit(&s->new_stripe); ++ closure_debug_destroy(&s->iodone); ++ kfree(s); ++} ++ ++static void ec_stripe_create_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, ++ struct bch_fs, ec_stripe_create_work); ++ struct ec_stripe_new *s, *n; ++restart: ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) ++ if (!atomic_read(&s->pin)) { ++ list_del(&s->list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ec_stripe_create(s); ++ goto restart; ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); ++ ++ if (atomic_dec_and_test(&s->pin)) { ++ BUG_ON(!s->pending); ++ queue_work(system_long_wq, &c->ec_stripe_create_work); ++ } ++} ++ ++static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = h->s; ++ ++ BUG_ON(!s->allocated && !s->err); ++ ++ h->s = NULL; ++ s->pending = true; ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_add(&s->list, &c->ec_stripe_new_list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ++ ec_stripe_new_put(c, s); ++} ++ ++/* have a full bucket - hand it off to be erasure coded: */ ++void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ if (ob->sectors_free) ++ s->err = -1; ++ ++ ec_stripe_new_put(c, s); ++} ++ ++void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ s->err = -EIO; ++} ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct bch_dev *ca; ++ unsigned offset; ++ ++ if (!ob) ++ return NULL; ++ ++ ca = bch_dev_bkey_exists(c, ob->dev); ++ offset = ca->mi.bucket_size - ob->sectors_free; ++ ++ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); ++} ++ ++void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, ++ struct bkey *k) ++{ ++ struct ec_stripe_new *ec = ob->ec; ++ ++ if (!ec) ++ return; ++ ++ mutex_lock(&ec->lock); ++ ++ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, ++ ARRAY_SIZE(ec->inline_keys), ++ BKEY_U64s)) { ++ BUG(); ++ } ++ ++ bkey_init(&ec->keys.top->k); ++ ec->keys.top->k.p = k->p; ++ ec->keys.top->k.size = k->size; ++ bch2_keylist_push(&ec->keys); 
++ ++ mutex_unlock(&ec->lock); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ unsigned l = *((const unsigned *) _l); ++ unsigned r = *((const unsigned *) _r); ++ ++ return cmp_int(l, r); ++} ++ ++/* pick most common bucket size: */ ++static unsigned pick_blocksize(struct bch_fs *c, ++ struct bch_devs_mask *devs) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; ++ struct { ++ unsigned nr, size; ++ } cur = { 0, 0 }, best = { 0, 0 }; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ sizes[nr++] = ca->mi.bucket_size; ++ ++ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); ++ ++ for (i = 0; i < nr; i++) { ++ if (sizes[i] != cur.size) { ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ cur.nr = 0; ++ cur.size = sizes[i]; ++ } ++ ++ cur.nr++; ++ } ++ ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ return best.size; ++} ++ ++static bool may_create_new_stripe(struct bch_fs *c) ++{ ++ return false; ++} ++ ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ unsigned nr_data, ++ unsigned nr_parity, ++ unsigned stripe_size) ++{ ++ unsigned u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = nr_data + nr_parity; ++ s->v.nr_redundant = nr_parity; ++ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); ++ s->v.csum_type = BCH_CSUM_crc32c; ++ s->v.pad = 0; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s; ++ ++ lockdep_assert_held(&h->lock); ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ mutex_init(&s->lock); ++ closure_init(&s->iodone, NULL); ++ atomic_set(&s->pin, 1); ++ s->c = c; ++ s->h = h; ++ s->nr_data = min_t(unsigned, h->nr_active_devs, ++ BCH_BKEY_PTRS_MAX) - h->redundancy; ++ s->nr_parity = h->redundancy; ++ ++ bch2_keylist_init(&s->keys, s->inline_keys); ++ ++ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, ++ s->nr_parity, h->blocksize); ++ ++ h->s = s; ++ return 0; ++} ++ ++static struct ec_stripe_head * ++ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, ++ unsigned algo, unsigned redundancy, ++ bool copygc) ++{ ++ struct ec_stripe_head *h; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ h = kzalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) ++ return NULL; ++ ++ mutex_init(&h->lock); ++ mutex_lock(&h->lock); ++ ++ h->target = target; ++ h->algo = algo; ++ h->redundancy = redundancy; ++ h->copygc = copygc; ++ ++ rcu_read_lock(); ++ h->devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(i, h->devs.d); ++ ++ h->blocksize = pick_blocksize(c, &h->devs); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ ++ rcu_read_unlock(); ++ list_add(&h->list, &c->ec_stripe_head_list); ++ return h; ++} ++ ++void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ if (h->s && ++ h->s->allocated && ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->nr_data) == h->s->nr_data) ++ ec_stripe_set_pending(c, h); ++ ++ mutex_unlock(&h->lock); ++} ++ ++struct ec_stripe_head *__bch2_ec_stripe_head_get(struct 
bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy, ++ bool copygc) ++{ ++ struct ec_stripe_head *h; ++ ++ if (!redundancy) ++ return NULL; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) ++ if (h->target == target && ++ h->algo == algo && ++ h->redundancy == redundancy && ++ h->copygc == copygc) { ++ mutex_lock(&h->lock); ++ goto found; ++ } ++ ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); ++found: ++ mutex_unlock(&c->ec_stripe_head_lock); ++ return h; ++} ++ ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ++ struct closure *cl) ++{ ++ struct bch_devs_mask devs = h->devs; ++ struct open_bucket *ob; ++ struct open_buckets buckets; ++ unsigned i, j, nr_have_parity = 0, nr_have_data = 0; ++ bool have_cache = true; ++ int ret = 0; ++ ++ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ if (test_bit(i, h->s->blocks_gotten)) { ++ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); ++ if (i < h->s->nr_data) ++ nr_have_data++; ++ else ++ nr_have_parity++; ++ } ++ } ++ ++ BUG_ON(nr_have_data > h->s->nr_data); ++ BUG_ON(nr_have_parity > h->s->nr_parity); ++ ++ buckets.nr = 0; ++ if (nr_have_parity < h->s->nr_parity) { ++ ret = bch2_bucket_alloc_set(c, &buckets, ++ &h->parity_stripe, ++ &devs, ++ h->s->nr_parity, ++ &nr_have_parity, ++ &have_cache, ++ h->copygc ++ ? RESERVE_movinggc ++ : RESERVE_none, ++ 0, ++ cl); ++ ++ open_bucket_for_each(c, &buckets, ob, i) { ++ j = find_next_zero_bit(h->s->blocks_gotten, ++ h->s->nr_data + h->s->nr_parity, ++ h->s->nr_data); ++ BUG_ON(j >= h->s->nr_data + h->s->nr_parity); ++ ++ h->s->blocks[j] = buckets.v[i]; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ __set_bit(j, h->s->blocks_gotten); ++ } ++ ++ if (ret) ++ return ret; ++ } ++ ++ buckets.nr = 0; ++ if (nr_have_data < h->s->nr_data) { ++ ret = bch2_bucket_alloc_set(c, &buckets, ++ &h->block_stripe, ++ &devs, ++ h->s->nr_data, ++ &nr_have_data, ++ &have_cache, ++ h->copygc ++ ? 
RESERVE_movinggc ++ : RESERVE_none, ++ 0, ++ cl); ++ ++ open_bucket_for_each(c, &buckets, ob, i) { ++ j = find_next_zero_bit(h->s->blocks_gotten, ++ h->s->nr_data, 0); ++ BUG_ON(j >= h->s->nr_data); ++ ++ h->s->blocks[j] = buckets.v[i]; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ __set_bit(j, h->s->blocks_gotten); ++ } ++ ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* XXX: doesn't obey target: */ ++static s64 get_existing_stripe(struct bch_fs *c, ++ struct ec_stripe_head *head) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t heap_idx; ++ u64 stripe_idx; ++ s64 ret = -1; ++ ++ if (may_create_new_stripe(c)) ++ return -1; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { ++ /* No blocks worth reusing, stripe will just be deleted: */ ++ if (!h->data[heap_idx].blocks_nonempty) ++ continue; ++ ++ stripe_idx = h->data[heap_idx].idx; ++ m = genradix_ptr(&c->stripes, stripe_idx); ++ ++ if (m->algorithm == head->algo && ++ m->nr_redundant == head->redundancy && ++ m->sectors == head->blocksize && ++ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { ++ bch2_stripes_heap_del(c, m, stripe_idx); ++ ret = stripe_idx; ++ break; ++ } ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return ret; ++} ++ ++static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, ++ struct ec_stripe_head *h) ++{ ++ unsigned i; ++ s64 idx; ++ int ret; ++ ++ idx = get_existing_stripe(c, h); ++ if (idx < 0) { ++ bch_err(c, "failed to find an existing stripe"); ++ return -ENOSPC; ++ } ++ ++ h->s->have_existing_stripe = true; ++ ret = get_stripe_key(c, idx, &h->s->existing_stripe); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); ++ return ret; ++ } ++ ++ if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { ++ /* ++ * this is a problem: we have deleted from the ++ * stripes heap already ++ */ ++ BUG(); ++ } ++ ++ BUG_ON(h->s->existing_stripe.size != h->blocksize); ++ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); ++ ++ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { ++ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_gotten); ++ __set_bit(i, h->s->blocks_allocated); ++ } ++ ++ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); ++ } ++ ++ bkey_copy(&h->s->new_stripe.key.k_i, ++ &h->s->existing_stripe.key.k_i); ++ ++ return 0; ++} ++ ++static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, ++ struct ec_stripe_head *h) ++{ ++ int ret; ++ ++ ret = bch2_disk_reservation_get(c, &h->s->res, ++ h->blocksize, ++ h->s->nr_parity, 0); ++ ++ if (ret) { ++ /* ++ * This means we need to wait for copygc to ++ * empty out buckets from existing stripes: ++ */ ++ bch_err(c, "failed to reserve stripe"); ++ } ++ ++ return ret; ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy, ++ bool copygc, ++ struct closure *cl) ++{ ++ struct ec_stripe_head *h; ++ int ret; ++ bool needs_stripe_new; ++ ++ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); ++ if (!h) { ++ bch_err(c, "no stripe head"); ++ return NULL; ++ } ++ ++ needs_stripe_new = !h->s; ++ if (needs_stripe_new) { ++ if (ec_new_stripe_alloc(c, h)) { ++ ret = -ENOMEM; ++ bch_err(c, "failed to allocate new stripe"); ++ goto err; ++ } ++ ++ if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) ++ BUG(); ++ } ++ ++ /* ++ * Try reserve a new 
stripe before reusing an ++ * existing stripe. This will prevent unnecessary ++ * read amplification during write oriented workloads. ++ */ ++ ret = 0; ++ if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) ++ ret = __bch2_ec_stripe_head_reserve(c, h); ++ if (ret && needs_stripe_new) ++ ret = __bch2_ec_stripe_head_reuse(c, h); ++ if (ret) ++ goto err; ++ ++ if (!h->s->allocated) { ++ ret = new_stripe_alloc_buckets(c, h, cl); ++ if (ret) ++ goto err; ++ ++ h->s->allocated = true; ++ } ++ ++ return h; ++ ++err: ++ bch2_ec_stripe_head_put(c, h); ++ return ERR_PTR(ret); ++} ++ ++void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ ++ mutex_lock(&h->lock); ++ if (!h->s) ++ goto unlock; ++ ++ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ if (!h->s->blocks[i]) ++ continue; ++ ++ ob = c->open_buckets + h->s->blocks[i]; ++ if (ob->dev == ca->dev_idx) ++ goto found; ++ } ++ goto unlock; ++found: ++ h->s->err = -EROFS; ++ ec_stripe_set_pending(c, h); ++unlock: ++ mutex_unlock(&h->lock); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++} ++ ++void bch2_stripes_heap_start(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct stripe *m; ++ ++ genradix_for_each(&c->stripes, iter, m) ++ if (m->alive) ++ bch2_stripes_heap_insert(c, m, iter.pos); ++} ++ ++int bch2_stripes_read(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *s; ++ struct stripe *m; ++ unsigned i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); ++ if (ret) ++ break; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ ++ m = genradix_ptr(&c->stripes, k.k->p.offset); ++ m->alive = true; ++ m->sectors = le16_to_cpu(s->sectors); ++ m->algorithm = s->algorithm; ++ m->nr_blocks = s->nr_blocks; ++ m->nr_redundant = s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(s, i); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading stripes: %i", ret); ++ ++ return ret; ++} ++ ++void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (i = 0; i < min_t(size_t, h->used, 20); i++) { ++ m = genradix_ptr(&c->stripes, h->data[i].idx); ++ ++ prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, ++ h->data[i].blocks_nonempty, ++ m->nr_blocks - m->nr_redundant, ++ m->nr_redundant); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++} ++ ++void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ prt_printf(out, "target %u algo %u redundancy %u:\n", ++ h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ prt_printf(out, "\tpending: blocks %u+%u allocated %u\n", ++ 
h->s->nr_data, h->s->nr_parity, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->nr_data)); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry(s, &c->ec_stripe_new_list, list) { ++ prt_printf(out, "\tin flight: blocks %u+%u pin %u\n", ++ s->nr_data, s->nr_parity, ++ atomic_read(&s->pin)); ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++void bch2_fs_ec_exit(struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ ++ while (1) { ++ mutex_lock(&c->ec_stripe_head_lock); ++ h = list_first_entry_or_null(&c->ec_stripe_head_list, ++ struct ec_stripe_head, list); ++ if (h) ++ list_del(&h->list); ++ mutex_unlock(&c->ec_stripe_head_lock); ++ if (!h) ++ break; ++ ++ BUG_ON(h->s); ++ kfree(h); ++ } ++ ++ BUG_ON(!list_empty(&c->ec_stripe_new_list)); ++ ++ free_heap(&c->ec_stripes_heap); ++ genradix_free(&c->stripes); ++ bioset_exit(&c->ec_bioset); ++} ++ ++void bch2_fs_ec_init_early(struct bch_fs *c) ++{ ++ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); ++ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); ++} ++ ++int bch2_fs_ec_init(struct bch_fs *c) ++{ ++ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), ++ BIOSET_NEED_BVECS); ++} +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +new file mode 100644 +index 000000000000..a4c13d61af10 +--- /dev/null ++++ b/fs/bcachefs/ec.h +@@ -0,0 +1,230 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_H ++#define _BCACHEFS_EC_H ++ ++#include "ec_types.h" ++#include "buckets_types.h" ++#include "keylist_types.h" ++ ++int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); ++void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++ .key_invalid = bch2_stripe_invalid, \ ++ .val_to_text = bch2_stripe_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_stripe, \ ++ .atomic_trigger = bch2_mark_stripe, \ ++} ++ ++static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(le16_to_cpu(s->sectors), ++ 1 << s->csum_granularity_bits); ++} ++ ++static inline unsigned stripe_csum_offset(const struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; ++ ++ return sizeof(struct bch_stripe) + ++ sizeof(struct bch_extent_ptr) * s->nr_blocks + ++ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; ++} ++ ++static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return stripe_csum_offset(s, s->nr_blocks, 0) + ++ sizeof(u16) * idx; ++} ++ ++static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); ++} ++ ++static inline void stripe_blockcount_set(struct bch_stripe *s, ++ unsigned idx, unsigned v) ++{ ++ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); ++ ++ *p = cpu_to_le16(v); ++} ++ ++static inline unsigned stripe_val_u64s(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), ++ sizeof(u64)); ++} ++ ++static inline void *stripe_csum(struct bch_stripe *s, ++ unsigned block, unsigned csum_idx) ++{ ++ EBUG_ON(block >= s->nr_blocks); ++ EBUG_ON(csum_idx >= stripe_csums_per_device(s)); ++ ++ return (void *) s + stripe_csum_offset(s, block, csum_idx); ++} ++ ++static inline struct bch_csum stripe_csum_get(struct 
bch_stripe *s, ++ unsigned block, unsigned csum_idx) ++{ ++ struct bch_csum csum = { 0 }; ++ ++ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); ++ return csum; ++} ++ ++static inline void stripe_csum_set(struct bch_stripe *s, ++ unsigned block, unsigned csum_idx, ++ struct bch_csum csum) ++{ ++ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); ++} ++ ++static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, ++ const struct bch_extent_ptr *data_ptr, ++ unsigned sectors) ++{ ++ return data_ptr->dev == stripe_ptr->dev && ++ data_ptr->gen == stripe_ptr->gen && ++ data_ptr->offset >= stripe_ptr->offset && ++ data_ptr->offset < stripe_ptr->offset + sectors; ++} ++ ++static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, ++ struct extent_ptr_decoded p) ++{ ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ ++ BUG_ON(!p.has_ec); ++ ++ if (p.ec.block >= nr_data) ++ return false; ++ ++ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, ++ le16_to_cpu(s->sectors)); ++} ++ ++static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, ++ struct extent_ptr_decoded p) ++{ ++ unsigned nr_data = m->nr_blocks - m->nr_redundant; ++ ++ BUG_ON(!p.has_ec); ++ ++ if (p.ec.block >= nr_data) ++ return false; ++ ++ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, ++ m->sectors); ++} ++ ++struct bch_read_bio; ++ ++struct ec_stripe_buf { ++ /* might not be buffering the entire stripe: */ ++ unsigned offset; ++ unsigned size; ++ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; ++ ++ void *data[BCH_BKEY_PTRS_MAX]; ++ ++ union { ++ struct bkey_i_stripe key; ++ u64 pad[255]; ++ }; ++}; ++ ++struct ec_stripe_head; ++ ++struct ec_stripe_new { ++ struct bch_fs *c; ++ struct ec_stripe_head *h; ++ struct mutex lock; ++ struct list_head list; ++ struct closure iodone; ++ ++ /* counts in flight writes, stripe is created when pin == 0 */ ++ atomic_t pin; ++ ++ int err; ++ ++ u8 nr_data; ++ u8 nr_parity; ++ bool allocated; ++ bool pending; ++ bool have_existing_stripe; ++ ++ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; ++ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; ++ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; ++ struct disk_reservation res; ++ ++ struct keylist keys; ++ u64 inline_keys[BKEY_U64s * 8]; ++ ++ struct ec_stripe_buf new_stripe; ++ struct ec_stripe_buf existing_stripe; ++}; ++ ++struct ec_stripe_head { ++ struct list_head list; ++ struct mutex lock; ++ ++ unsigned target; ++ unsigned algo; ++ unsigned redundancy; ++ bool copygc; ++ ++ struct bch_devs_mask devs; ++ unsigned nr_active_devs; ++ ++ unsigned blocksize; ++ ++ struct dev_stripe_state block_stripe; ++ struct dev_stripe_state parity_stripe; ++ ++ struct ec_stripe_new *s; ++}; ++ ++int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); ++void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, ++ struct bkey *); ++ ++void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); ++void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); ++ ++void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, ++ unsigned, unsigned, unsigned, bool, struct closure *); ++ ++void bch2_stripes_heap_update(struct bch_fs *, 
struct stripe *, size_t); ++void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); ++ ++void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); ++ ++void bch2_ec_flush_new_stripes(struct bch_fs *); ++ ++void bch2_stripes_heap_start(struct bch_fs *); ++ ++int bch2_stripes_read(struct bch_fs *); ++ ++void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); ++void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_ec_exit(struct bch_fs *); ++void bch2_fs_ec_init_early(struct bch_fs *); ++int bch2_fs_ec_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_EC_H */ +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +new file mode 100644 +index 000000000000..edd93da663c1 +--- /dev/null ++++ b/fs/bcachefs/ec_types.h +@@ -0,0 +1,46 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_TYPES_H ++#define _BCACHEFS_EC_TYPES_H ++ ++#include ++ ++struct bch_replicas_padded { ++ struct bch_replicas_entry e; ++ u8 pad[BCH_BKEY_PTRS_MAX]; ++}; ++ ++struct stripe { ++ size_t heap_idx; ++ ++ u16 sectors; ++ u8 algorithm; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? */ ++ unsigned on_heap:1; ++ u8 blocks_nonempty; ++}; ++ ++struct gc_stripe { ++ u16 sectors; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? */ ++ u16 block_sectors[BCH_BKEY_PTRS_MAX]; ++ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; ++ ++ struct bch_replicas_padded r; ++}; ++ ++struct ec_stripe_heap_entry { ++ size_t idx; ++ unsigned blocks_nonempty; ++}; ++ ++typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; ++ ++#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c +new file mode 100644 +index 000000000000..9da8a5973af0 +--- /dev/null ++++ b/fs/bcachefs/errcode.c +@@ -0,0 +1,51 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "errcode.h" ++ ++#include ++ ++static const char * const bch2_errcode_strs[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, ++ BCH_ERRCODES() ++#undef x ++ NULL ++}; ++ ++#define BCH_ERR_0 0 ++ ++static unsigned bch2_errcode_parents[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, ++ BCH_ERRCODES() ++#undef x ++}; ++ ++const char *bch2_err_str(int err) ++{ ++ const char *errstr; ++ err = abs(err); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ ++ if (err >= BCH_ERR_START) ++ errstr = bch2_errcode_strs[err - BCH_ERR_START]; ++ else if (err) ++ errstr = errname(err); ++ else ++ errstr = "(No error)"; ++ return errstr ?: "(Invalid error)"; ++} ++ ++bool __bch2_err_matches(int err, int class) ++{ ++ err = abs(err); ++ class = abs(class); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ BUG_ON(class >= BCH_ERR_MAX); ++ ++ while (err >= BCH_ERR_START && err != class) ++ err = bch2_errcode_parents[err - BCH_ERR_START]; ++ ++ return err == class; ++} +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +new file mode 100644 +index 000000000000..95925c8434b3 +--- /dev/null ++++ b/fs/bcachefs/errcode.h +@@ -0,0 +1,64 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERRCODE_H ++#define _BCACHEFS_ERRCODE_H ++ ++#define BCH_ERRCODES() \ ++ x(0, open_buckets_empty) \ ++ x(0, freelist_empty) \ ++ x(freelist_empty, no_buckets_found) \ ++ x(0, insufficient_devices) \ ++ x(0, transaction_restart) \ ++ x(transaction_restart, 
transaction_restart_fault_inject) \ ++ x(transaction_restart, transaction_restart_relock) \ ++ x(transaction_restart, transaction_restart_relock_path) \ ++ x(transaction_restart, transaction_restart_relock_path_intent) \ ++ x(transaction_restart, transaction_restart_relock_after_fill) \ ++ x(transaction_restart, transaction_restart_too_many_iters) \ ++ x(transaction_restart, transaction_restart_lock_node_reused) \ ++ x(transaction_restart, transaction_restart_fill_relock) \ ++ x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ ++ x(transaction_restart, transaction_restart_mem_realloced) \ ++ x(transaction_restart, transaction_restart_in_traverse_all) \ ++ x(transaction_restart, transaction_restart_would_deadlock) \ ++ x(transaction_restart, transaction_restart_would_deadlock_write)\ ++ x(transaction_restart, transaction_restart_upgrade) \ ++ x(transaction_restart, transaction_restart_key_cache_fill) \ ++ x(transaction_restart, transaction_restart_key_cache_raced) \ ++ x(transaction_restart, transaction_restart_key_cache_realloced)\ ++ x(transaction_restart, transaction_restart_journal_preres_get) \ ++ x(transaction_restart, transaction_restart_nested) \ ++ x(0, lock_fail_node_reused) \ ++ x(0, lock_fail_root_changed) \ ++ x(0, journal_reclaim_would_deadlock) \ ++ x(0, fsck) \ ++ x(fsck, fsck_fix) \ ++ x(fsck, fsck_ignore) \ ++ x(fsck, fsck_errors_not_fixed) \ ++ x(fsck, fsck_repair_unimplemented) \ ++ x(fsck, fsck_repair_impossible) \ ++ x(0, need_snapshot_cleanup) \ ++ x(0, need_topology_repair) ++ ++enum bch_errcode { ++ BCH_ERR_START = 2048, ++#define x(class, err) BCH_ERR_##err, ++ BCH_ERRCODES() ++#undef x ++ BCH_ERR_MAX ++}; ++ ++const char *bch2_err_str(int); ++bool __bch2_err_matches(int, int); ++ ++static inline bool _bch2_err_matches(int err, int class) ++{ ++ return err && __bch2_err_matches(err, class); ++} ++ ++#define bch2_err_matches(_err, _class) \ ++({ \ ++ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ ++ _bch2_err_matches(_err, _class); \ ++}) ++ ++#endif /* _BCACHFES_ERRCODE_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +new file mode 100644 +index 000000000000..f6a895b2ceb7 +--- /dev/null ++++ b/fs/bcachefs/error.c +@@ -0,0 +1,184 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "error.h" ++#include "io.h" ++#include "super.h" ++ ++#define FSCK_ERR_RATELIMIT_NR 10 ++ ++bool bch2_inconsistent_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_ERROR, &c->flags); ++ ++ switch (c->opts.errors) { ++ case BCH_ON_ERROR_continue: ++ return false; ++ case BCH_ON_ERROR_ro: ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "inconsistency detected - emergency read only"); ++ return true; ++ case BCH_ON_ERROR_panic: ++ panic(bch2_fmt(c, "panic after error")); ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_topology_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); ++ if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ bch2_inconsistent_error(c); ++} ++ ++void bch2_fatal_error(struct bch_fs *c) ++{ ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "fatal error - emergency read only"); ++} ++ ++void bch2_io_error_work(struct work_struct *work) ++{ ++ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); ++ struct bch_fs *c = ca->fs; ++ bool dev; ++ ++ down_write(&c->state_lock); ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, ++ BCH_FORCE_IF_DEGRADED); ++ if (dev ++ ? 
__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, ++ BCH_FORCE_IF_DEGRADED) ++ : bch2_fs_emergency_read_only(c)) ++ bch_err(ca, ++ "too many IO errors, setting %s RO", ++ dev ? "device" : "filesystem"); ++ up_write(&c->state_lock); ++} ++ ++void bch2_io_error(struct bch_dev *ca) ++{ ++ //queue_work(system_long_wq, &ca->io_error_work); ++} ++ ++#ifdef __KERNEL__ ++#define ask_yn() false ++#else ++#include "tools-util.h" ++#endif ++ ++int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) ++{ ++ struct fsck_err_state *s = NULL; ++ va_list args; ++ bool fix = false, print = true, suppressing = false; ++ char _buf[sizeof(s->buf)], *buf = _buf; ++ ++ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { ++ va_start(args, fmt); ++ vprintk(fmt, args); ++ va_end(args); ++ ++ if (c->opts.errors == BCH_ON_ERROR_continue) { ++ bch_err(c, "fixing"); ++ return -BCH_ERR_fsck_fix; ++ } else { ++ bch2_inconsistent_error(c); ++ return -BCH_ERR_fsck_errors_not_fixed; ++ } ++ } ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry(s, &c->fsck_errors, list) ++ if (s->fmt == fmt) ++ goto found; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS); ++ if (!s) { ++ if (!c->fsck_alloc_err) ++ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); ++ c->fsck_alloc_err = true; ++ buf = _buf; ++ goto print; ++ } ++ ++ INIT_LIST_HEAD(&s->list); ++ s->fmt = fmt; ++found: ++ list_move(&s->list, &c->fsck_errors); ++ s->nr++; ++ if (c->opts.ratelimit_errors && ++ !(flags & FSCK_NO_RATELIMIT) && ++ s->nr >= FSCK_ERR_RATELIMIT_NR) { ++ if (s->nr == FSCK_ERR_RATELIMIT_NR) ++ suppressing = true; ++ else ++ print = false; ++ } ++ buf = s->buf; ++print: ++ va_start(args, fmt); ++ vscnprintf(buf, sizeof(_buf), fmt, args); ++ va_end(args); ++ ++ if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ bch_err(c, "%s, exiting", buf); ++ } else if (flags & FSCK_CAN_FIX) { ++ if (c->opts.fix_errors == FSCK_OPT_ASK) { ++ printk(KERN_ERR "%s: fix?", buf); ++ fix = ask_yn(); ++ } else if (c->opts.fix_errors == FSCK_OPT_YES || ++ (c->opts.nochanges && ++ !(flags & FSCK_CAN_IGNORE))) { ++ if (print) ++ bch_err(c, "%s, fixing", buf); ++ fix = true; ++ } else { ++ if (print) ++ bch_err(c, "%s, not fixing", buf); ++ fix = false; ++ } ++ } else if (flags & FSCK_NEED_FSCK) { ++ if (print) ++ bch_err(c, "%s (run fsck to correct)", buf); ++ } else { ++ if (print) ++ bch_err(c, "%s (repair unimplemented)", buf); ++ } ++ ++ if (suppressing) ++ bch_err(c, "Ratelimiting new instances of previous error"); ++ ++ mutex_unlock(&c->fsck_error_lock); ++ ++ if (fix) { ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ return -BCH_ERR_fsck_fix; ++ } else { ++ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); ++ set_bit(BCH_FS_ERROR, &c->flags); ++ return c->opts.fix_errors == FSCK_OPT_EXIT || ++ !(flags & FSCK_CAN_IGNORE) ++ ? 
-BCH_ERR_fsck_errors_not_fixed ++ : -BCH_ERR_fsck_ignore; ++ } ++} ++ ++void bch2_flush_fsck_errs(struct bch_fs *c) ++{ ++ struct fsck_err_state *s, *n; ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ if (s->ratelimited) ++ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); ++ ++ list_del(&s->list); ++ kfree(s); ++ } ++ ++ mutex_unlock(&c->fsck_error_lock); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +new file mode 100644 +index 000000000000..b603d738c549 +--- /dev/null ++++ b/fs/bcachefs/error.h +@@ -0,0 +1,223 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERROR_H ++#define _BCACHEFS_ERROR_H ++ ++#include ++#include ++ ++struct bch_dev; ++struct bch_fs; ++struct work_struct; ++ ++/* ++ * XXX: separate out errors that indicate on disk data is inconsistent, and flag ++ * superblock as such ++ */ ++ ++/* Error messages: */ ++ ++/* ++ * Inconsistency errors: The on disk data is inconsistent. If these occur during ++ * initial recovery, they don't indicate a bug in the running code - we walk all ++ * the metadata before modifying anything. If they occur at runtime, they ++ * indicate either a bug in the running code or (less likely) data is being ++ * silently corrupted under us. ++ * ++ * XXX: audit all inconsistent errors and make sure they're all recoverable, in ++ * BCH_ON_ERROR_CONTINUE mode ++ */ ++ ++bool bch2_inconsistent_error(struct bch_fs *); ++ ++void bch2_topology_error(struct bch_fs *); ++ ++#define bch2_fs_inconsistent(c, ...) \ ++({ \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_inconsistent_error(c); \ ++}) ++ ++#define bch2_fs_inconsistent_on(cond, c, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_fs_inconsistent(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Later we might want to mark only the particular device inconsistent, not the ++ * entire filesystem: ++ */ ++ ++#define bch2_dev_inconsistent(ca, ...) \ ++do { \ ++ bch_err(ca, __VA_ARGS__); \ ++ bch2_inconsistent_error((ca)->fs); \ ++} while (0) ++ ++#define bch2_dev_inconsistent_on(cond, ca, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * When a transaction update discovers or is causing a fs inconsistency, it's ++ * helpful to also dump the pending updates: ++ */ ++#define bch2_trans_inconsistent(trans, ...) \ ++({ \ ++ bch_err(trans->c, __VA_ARGS__); \ ++ bch2_inconsistent_error(trans->c); \ ++ bch2_dump_trans_updates(trans); \ ++}) ++ ++#define bch2_trans_inconsistent_on(cond, trans, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_trans_inconsistent(trans, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Fsck errors: inconsistency errors we detect at mount time, and should ideally ++ * be able to repair: ++ */ ++ ++enum fsck_err_opts { ++ FSCK_OPT_EXIT, ++ FSCK_OPT_YES, ++ FSCK_OPT_NO, ++ FSCK_OPT_ASK, ++}; ++ ++struct fsck_err_state { ++ struct list_head list; ++ const char *fmt; ++ u64 nr; ++ bool ratelimited; ++ char buf[512]; ++}; ++ ++#define FSCK_CAN_FIX (1 << 0) ++#define FSCK_CAN_IGNORE (1 << 1) ++#define FSCK_NEED_FSCK (1 << 2) ++#define FSCK_NO_RATELIMIT (1 << 3) ++ ++__printf(3, 4) __cold ++int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); ++void bch2_flush_fsck_errs(struct bch_fs *); ++ ++#define __fsck_err(c, _flags, msg, ...) 
\ ++({ \ ++ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ ++ \ ++ if (_ret != -BCH_ERR_fsck_fix && \ ++ _ret != -BCH_ERR_fsck_ignore) { \ ++ bch_err(c, "Unable to continue, halting"); \ ++ ret = _ret; \ ++ goto fsck_err; \ ++ } \ ++ \ ++ _ret == -BCH_ERR_fsck_fix; \ ++}) ++ ++/* These macros return true if error should be fixed: */ ++ ++/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ ++ ++#define __fsck_err_on(cond, c, _flags, ...) \ ++ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ ++#define need_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define need_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++#define fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++/* ++ * Fatal errors: these don't indicate a bug, but we can't continue running in RW ++ * mode - pretty much just due to metadata IO errors: ++ */ ++ ++void bch2_fatal_error(struct bch_fs *); ++ ++#define bch2_fs_fatal_error(c, ...) \ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_fatal_error(c); \ ++} while (0) ++ ++#define bch2_fs_fatal_err_on(cond, c, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_fs_fatal_error(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * IO errors: either recoverable metadata IO (because we have replicas), or data ++ * IO - we need to log it and print out a message, but we don't (necessarily) ++ * want to shut down the fs: ++ */ ++ ++void bch2_io_error_work(struct work_struct *); ++ ++/* Does the error handling without logging a message */ ++void bch2_io_error(struct bch_dev *); ++ ++/* Logs message and handles the error: */ ++#define bch2_dev_io_error(ca, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ ++ (ca)->name, ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ ++ (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_io_err_on(cond, ca, ...) \ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_io_error(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) 
\ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ ++ _ret; \ ++}) ++ ++#endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +new file mode 100644 +index 000000000000..2fd5d9672a44 +--- /dev/null ++++ b/fs/bcachefs/extent_update.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "debug.h" ++#include "extents.h" ++#include "extent_update.h" ++ ++/* ++ * This counts the number of iterators to the alloc & ec btrees we'll need ++ * inserting/removing this extent: ++ */ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0, lru = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ /* Might also be updating LRU btree */ ++ if (entry->ptr.cached) ++ lru++; ++ ++ fallthrough; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ /* ++ * Updating keys in the alloc btree may also update keys in the ++ * freespace or discard btrees: ++ */ ++ return lru + ret * 2; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters) ++{ ++ int ret = 0, ret2 = 0; ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key_norestart(trans, iter, ++ BTREE_ID_reflink, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret2) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ /* extent_update_to_keys(), for the reflink_v update */ ++ *nr_iters += 1; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += min_t(u64, k.k->size, ++ r_k.k->p.offset - idx); ++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ break; ++ } ++ } ++ ++ return ret2 ?: ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_iter copy; ++ struct bkey_s_c k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ *end = insert->k.p; ++ ++ /* extent_update_to_keys(): */ ++ nr_iters += 1; ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2); ++ if (ret < 0) ++ return ret; ++ ++ bch2_trans_copy_iter(©, iter); ++ ++ for_each_btree_key_continue_norestart(copy, 0, k, ret) { ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ 
bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ ++ /* extent_handle_overwrites(): */ ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ case BCH_EXTENT_OVERLAP_FRONT: ++ nr_iters += 1; ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ nr_iters += 2; ++ break; ++ } ++ ++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, ©); ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(trans, iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, k); ++ return 0; ++} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +new file mode 100644 +index 000000000000..6f5cf449361a +--- /dev/null ++++ b/fs/bcachefs/extent_update.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENT_UPDATE_H ++#define _BCACHEFS_EXTENT_UPDATE_H ++ ++#include "bcachefs.h" ++ ++int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bpos *); ++int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); ++ ++#endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +new file mode 100644 +index 000000000000..2ca13014b9c4 +--- /dev/null ++++ b/fs/bcachefs/extents.c +@@ -0,0 +1,1324 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * ++ * Code for managing the extent btree and dynamically updating the writeback ++ * dirty sector count. 
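bch2_extent_atomic_end() above walks the keys an insert would overlap, charges each one an iterator cost, and clamps *end as soon as the budget (EXTENT_ITERS_MAX) is exceeded, so a single transaction never needs more iterators than the btree code can hand out. The shape of that "scan with a budget, trim where it runs out" loop, reduced to plain C over an array of dummy extents (hypothetical types, not the bcachefs API):

#include <stddef.h>

struct ext {
	unsigned long start, end;	/* [start, end) in sectors */
	unsigned cost;			/* iterators this key would need */
};

/*
 * Trim the insert [ins_start, *ins_end) so the total cost of the
 * existing extents it overlaps stays within budget.
 */
static void trim_to_budget(const struct ext *keys, size_t nr,
			   unsigned long ins_start, unsigned long *ins_end,
			   unsigned budget)
{
	unsigned used = 0;

	for (size_t i = 0; i < nr; i++) {
		const struct ext *k = &keys[i];

		if (k->end <= ins_start || k->start >= *ins_end)
			continue;		/* no overlap with the insert */

		used += k->cost;
		if (used >= budget) {
			/* stop the insert at this key's end, or earlier */
			if (k->end < *ins_end)
				*ins_end = k->end;
			break;
		}
	}
}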
++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); ++ ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *, ++ struct bch_extent_crc_unpacked, ++ enum bch_extent_entry_type); ++ ++static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *i; ++ ++ for (i = f->devs; i < f->devs + f->nr; i++) ++ if (i->dev == dev) ++ return i; ++ ++ return NULL; ++} ++ ++void bch2_mark_io_failure(struct bch_io_failures *failed, ++ struct extent_ptr_decoded *p) ++{ ++ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ f->dev = p->ptr.dev; ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else if (p->idx != f->idx) { ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else { ++ f->nr_failed++; ++ } ++} ++ ++/* ++ * returns true if p1 is better than p2: ++ */ ++static inline bool ptr_better(struct bch_fs *c, ++ const struct extent_ptr_decoded p1, ++ const struct extent_ptr_decoded p2) ++{ ++ if (likely(!p1.idx && !p2.idx)) { ++ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); ++ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); ++ ++ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); ++ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); ++ ++ /* Pick at random, biased in favor of the faster device: */ ++ ++ return bch2_rand_range(l1 + l2) > l1; ++ } ++ ++ if (bch2_force_reconstruct_read) ++ return p1.idx > p2.idx; ++ ++ return p1.idx < p2.idx; ++} ++ ++/* ++ * This picks a non-stale pointer, preferably from a device other than @avoid. ++ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to ++ * other devices, it will still pick a pointer from avoid. ++ */ ++int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_dev_io_failures *f; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_error) ++ return -EIO; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ /* ++ * If there are any dirty pointers it's an error if we can't ++ * read: ++ */ ++ if (!ret && !p.ptr.cached) ++ ret = -EIO; ++ ++ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) ++ continue; ++ ++ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; ++ if (f) ++ p.idx = f->nr_failed < f->nr_retries ++ ? 
f->idx ++ : f->idx + 1; ++ ++ if (!p.idx && ++ !bch2_dev_is_readable(ca)) ++ p.idx++; ++ ++ if (bch2_force_reconstruct_read && ++ !p.idx && p.has_ec) ++ p.idx++; ++ ++ if (p.idx >= (unsigned) p.has_ec + 1) ++ continue; ++ ++ if (ret > 0 && !ptr_better(c, p, *pick)) ++ continue; ++ ++ *pick = p; ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { ++ prt_printf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), BCH_REPLICAS_MAX); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { ++ prt_printf(err, "value too small (%zu <= %zu)", ++ bkey_val_bytes(k.k), sizeof(*bp.v)); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { ++ prt_printf(err, "value too big (%zu > %zu)", ++ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); ++ return -EINVAL; ++ } ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot && ++ bp.v->min_key.snapshot) { ++ prt_printf(err, "invalid min_key.snapshot (%u != 0)", ++ bp.v->min_key.snapshot); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ prt_printf(out, "seq %llx written %u min_key %s", ++ le64_to_cpu(bp.v->seq), ++ le16_to_cpu(bp.v->sectors_written), ++ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); ++ ++ bch2_bpos_to_text(out, bp.v->min_key); ++ prt_printf(out, " "); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s k) ++{ ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); ++ ++ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bp.v->min_key, POS_MIN)) ++ bp.v->min_key = write ++ ? 
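ptr_better() above breaks ties between two healthy replicas with a weighted coin flip: each device's chance of being read is proportional to the other device's observed read latency, so faster devices serve more reads without the slower ones being starved entirely. A standalone sketch of that selection rule (rand_range() is a hypothetical stand-in for bch2_rand_range()):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Uniform random number in [0, max); a simple stand-in, not crypto-grade. */
static uint64_t rand_range(uint64_t max)
{
	return max ? (uint64_t)rand() % max : 0;
}

/*
 * Return true if replica 1 should win, given each replica's current read
 * latency: roughly P(pick 1) = l2 / (l1 + l2), i.e. biased toward the
 * device with the lower latency.
 */
static bool pick_replica1(uint64_t l1, uint64_t l2)
{
	return rand_range(l1 + l2) > l1;
}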
bpos_nosnap_predecessor(bp.v->min_key) ++ : bpos_nosnap_successor(bp.v->min_key); ++} ++ ++/* KEY_TYPE_extent: */ ++ ++bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); ++ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); ++ union bch_extent_entry *en_l; ++ const union bch_extent_entry *en_r; ++ struct extent_ptr_decoded lp, rp; ++ bool use_right_ptr; ++ struct bch_dev *ca; ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return false; ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ if (en_l < l_ptrs.end || en_r < r_ptrs.end) ++ return false; ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ lp.crc = bch2_extent_crc_unpack(l.k, NULL); ++ rp.crc = bch2_extent_crc_unpack(r.k, NULL); ++ ++ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && ++ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { ++ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != ++ rp.ptr.offset + rp.crc.offset || ++ lp.ptr.dev != rp.ptr.dev || ++ lp.ptr.gen != rp.ptr.gen || ++ lp.has_ec != rp.has_ec) ++ return false; ++ ++ /* Extents may not straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp.ptr.dev); ++ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) ++ return false; ++ ++ if (lp.has_ec != rp.has_ec || ++ (lp.has_ec && ++ (lp.ec.block != rp.ec.block || ++ lp.ec.redundancy != rp.ec.redundancy || ++ lp.ec.idx != rp.ec.idx))) ++ return false; ++ ++ if (lp.crc.compression_type != rp.crc.compression_type || ++ lp.crc.nonce != rp.crc.nonce) ++ return false; ++ ++ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= ++ lp.crc.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (lp.crc.live_size <= rp.crc.offset ) { ++ /* can use right extent's crc entry */ ++ } else { ++ /* check if checksums can be merged: */ ++ if (lp.crc.csum_type != rp.crc.csum_type || ++ lp.crc.nonce != rp.crc.nonce || ++ crc_is_compressed(lp.crc) || ++ !bch2_checksum_mergeable(lp.crc.csum_type)) ++ return false; ++ ++ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || ++ rp.crc.offset) ++ return false; ++ ++ if (lp.crc.csum_type && ++ lp.crc.uncompressed_size + ++ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) ++ return false; ++ } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { ++ if (extent_entry_is_crc(en_l)) { ++ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return false; ++ } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ use_right_ptr = false; ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end) { ++ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && ++ use_right_ptr) ++ en_l->ptr = en_r->ptr; ++ ++ if (extent_entry_is_crc(en_l)) { ++ struct bch_extent_crc_unpacked crc_l = ++ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ struct bch_extent_crc_unpacked crc_r = ++ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ use_right_ptr = false; ++ ++ if (crc_l.offset + crc_l.live_size + 
crc_r.live_size <= ++ crc_l.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (crc_l.live_size <= crc_r.offset ) { ++ /* can use right extent's crc entry */ ++ crc_r.offset -= crc_l.live_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, ++ extent_entry_type(en_l)); ++ use_right_ptr = true; ++ } else { ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } ++ } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(*r.v)); ++ return -EINVAL; ++ } ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { ++ prt_printf(err, "invalid nr_replicas (%u)", ++ r.v->nr_replicas); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ prt_printf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return false; ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++/* Extent checksum entries: */ ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); ++} ++ ++static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, ++ struct bch_extent_crc_unpacked n) ++{ ++ return !crc_is_compressed(u) && ++ u.csum_type && ++ u.uncompressed_size > u.live_size && ++ bch2_csum_type_is_encryption(u.csum_type) == ++ bch2_csum_type_is_encryption(n.csum_type); ++} ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, ++ struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ if (!n.csum_type) ++ return false; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (can_narrow_crc(crc, n)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're writing another replica for this extent, so while we've got the data in ++ * memory we'll be computing a new checksum for the currently live data. 
++ * ++ * If there are other replicas we aren't moving, and they are checksummed but ++ * not compressed, we can modify them to point to only the data that is ++ * currently live (so that readers won't have to bounce) while we've got the ++ * checksum we need: ++ */ ++bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked u; ++ struct extent_ptr_decoded p; ++ union bch_extent_entry *i; ++ bool ret = false; ++ ++ /* Find a checksum entry that covers only live data: */ ++ if (!n.csum_type) { ++ bkey_for_each_crc(&k->k, ptrs, u, i) ++ if (!crc_is_compressed(u) && ++ u.csum_type && ++ u.live_size == u.uncompressed_size) { ++ n = u; ++ goto found; ++ } ++ return false; ++ } ++found: ++ BUG_ON(crc_is_compressed(n)); ++ BUG_ON(n.offset); ++ BUG_ON(n.live_size != k->k.size); ++ ++restart_narrow_pointers: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ ++ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) ++ if (can_narrow_crc(p.crc, n)) { ++ __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ p.ptr.offset += p.crc.offset; ++ p.crc = n; ++ bch2_extent_ptr_decoded_append(k, &p); ++ ret = true; ++ goto restart_narrow_pointers; ++ } ++ ++ return ret; ++} ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) ++{ ++#define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (type) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields ++} ++ ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; ++ ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] <= 10 && ++ new.uncompressed_size <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); ++ ++ bch2_extent_crc_pack(crc, new, type); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++} ++ ++/* Generic code for keys with pointers: */ ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ return bch2_bkey_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) ++{ ++ return k.k->type == KEY_TYPE_reservation ++ ? 
bkey_s_c_to_reservation(k).v->nr_replicas ++ : bch2_bkey_dirty_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) ++{ ++ unsigned ret = 0; ++ ++ if (k.k->type == KEY_TYPE_reservation) { ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ } else { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ ret += !p.ptr.cached && !crc_is_compressed(p.crc); ++ } ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && crc_is_compressed(p.crc)) ++ ret += p.crc.compressed_size; ++ ++ return ret; ++} ++ ++bool bch2_bkey_is_incompressible(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ return true; ++ return false; ++} ++ ++unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p = { 0 }; ++ unsigned replicas = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (p.has_ec) ++ replicas += p.ec.redundancy; ++ ++ replicas++; ++ ++ } ++ ++ return replicas; ++} ++ ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) ++{ ++ unsigned durability = 0; ++ struct bch_dev *ca; ++ ++ if (p.ptr.cached) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_failed) ++ durability = max_t(unsigned, durability, ca->mi.durability); ++ ++ if (p.has_ec) ++ durability += p.ec.redundancy; ++ ++ return durability; ++} ++ ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned durability = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); ++ ++ return durability; ++} ++ ++void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ memmove_u64s(entry, next, (u64 *) end - (u64 *) next); ++ k->k.u64s -= extent_entry_u64s(entry); ++} ++ ++void bch2_bkey_append_ptr(struct bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); ++ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ++ ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void __extent_entry_insert(struct bkey_i *k, ++ union bch_extent_entry *dst, ++ union bch_extent_entry *new) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ ++ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), ++ dst, (u64 *) 
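bch2_bkey_replicas() and bch2_bkey_durability() above reduce a key's pointer list to a single number: cached pointers count for nothing, each dirty pointer contributes its device's durability (nothing if the device has failed), and an erasure-coded pointer additionally contributes the stripe's redundancy. A reduced sketch of that accounting over a plain array of pointer descriptions (hypothetical struct, not the bcachefs types):

#include <stdbool.h>
#include <stddef.h>

struct ptr_desc {
	bool cached;		/* cached copies don't add durability */
	bool dev_failed;	/* failed devices don't either */
	unsigned dev_durability;/* typically 1; 0 if the device provides none */
	unsigned ec_redundancy;	/* extra failures the stripe can absorb */
};

static unsigned key_durability(const struct ptr_desc *ptrs, size_t nr)
{
	unsigned total = 0;

	for (size_t i = 0; i < nr; i++) {
		const struct ptr_desc *p = &ptrs[i];

		if (p->cached)
			continue;

		if (!p->dev_failed)
			total += p->dev_durability;

		total += p->ec_redundancy;
	}

	return total;
}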
end - (u64 *) dst); ++ k->k.u64s += extent_entry_u64s(new); ++ memcpy(dst, new, extent_entry_bytes(new)); ++} ++ ++void bch2_extent_ptr_decoded_append(struct bkey_i *k, ++ struct extent_ptr_decoded *p) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(&k->k, NULL); ++ union bch_extent_entry *pos; ++ ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = ptrs.start; ++ goto found; ++ } ++ ++ bkey_for_each_crc(&k->k, ptrs, crc, pos) ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = extent_entry_next(pos); ++ goto found; ++ } ++ ++ bch2_extent_crc_append(k, p->crc); ++ pos = bkey_val_end(bkey_i_to_s(k)); ++found: ++ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ptr)); ++ ++ if (p->has_ec) { ++ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ec)); ++ } ++} ++ ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ /* stripes have ptrs, but their layout doesn't work with this code */ ++ BUG_ON(k.k->type == KEY_TYPE_stripe); ++ ++ memmove_u64s_down(entry, next, ++ (u64 *) bkey_val_end(k) - (u64 *) next); ++ k.k->u64s -= (u64 *) next - (u64 *) entry; ++} ++ ++/* ++ * Returns pointer to the next entry after the one being dropped: ++ */ ++static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry = to_entry(ptr), *next; ++ union bch_extent_entry *ret = entry; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ for (next = extent_entry_next(entry); ++ next != ptrs.end; ++ next = extent_entry_next(next)) { ++ if (extent_entry_is_crc(next)) { ++ break; ++ } else if (extent_entry_is_ptr(next)) { ++ drop_crc = false; ++ break; ++ } ++ } ++ ++ extent_entry_drop(k, entry); ++ ++ while ((entry = extent_entry_prev(ptrs, entry))) { ++ if (extent_entry_is_ptr(entry)) ++ break; ++ ++ if ((extent_entry_is_crc(entry) && drop_crc) || ++ extent_entry_is_stripe_ptr(entry)) { ++ ret = (void *) ret - extent_entry_bytes(entry); ++ extent_entry_drop(k, entry); ++ } ++ } ++ ++ return ret; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; ++ union bch_extent_entry *ret = ++ __bch2_bkey_drop_ptr(k, ptr); ++ ++ /* ++ * If we deleted all the dirty pointers and there's still cached ++ * pointers, we could set the cached pointers to dirty if they're not ++ * stale - but to do that correctly we'd need to grab an open_bucket ++ * reference so that we don't race with bucket reuse: ++ */ ++ if (have_dirty && ++ !bch2_bkey_dirty_devs(k.s_c).nr) { ++ k.k->type = KEY_TYPE_error; ++ set_bkey_val_u64s(k.k, 0); ++ ret = NULL; ++ } else if (!bch2_bkey_nr_ptrs(k.s_c)) { ++ k.k->type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(k.k, 0); ++ ret = NULL; ++ } ++ ++ return ret; ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ 
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); ++ ++ if (ptr) ++ __bch2_bkey_drop_ptr(k, ptr); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * Returns true if two extents refer to the same data: ++ */ ++bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) ++{ ++ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); ++ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); ++ const union bch_extent_entry *entry1, *entry2; ++ struct extent_ptr_decoded p1, p2; ++ ++ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) ++ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) ++ if (p1.ptr.dev == p2.ptr.dev && ++ p1.ptr.gen == p2.ptr.gen && ++ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, ++ struct bkey_s_c k2) ++{ ++ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); ++ const union bch_extent_entry *entry2; ++ struct extent_ptr_decoded p2; ++ ++ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) ++ if (p1.ptr.dev == p2.ptr.dev && ++ p1.ptr.gen == p2.ptr.gen && ++ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. ++ * ++ * Returns true if @k should be dropped entirely ++ * ++ * For existing keys, only called when btree nodes are being rewritten, not when ++ * they're merely being compacted/resorted in memory. 
++ */ ++bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ++ ptr->cached && ++ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); ++ ++ return bkey_deleted(k.k); ++} ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ prt_printf(out, " "); ++ ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ if (!ca) { ++ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ } else { ++ u32 offset; ++ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); ++ ++ prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, ++ b, offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ ++ if (ca && ptr_stale(ca, ptr)) ++ prt_printf(out, " stale"); ++ } ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ bch2_csum_types[crc.csum_type], ++ bch2_compression_types[crc.compression_type]); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ prt_printf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; ++ } ++ ++ first = false; ++ } ++} ++ ++static int extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata, ++ struct printbuf *err) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ u64 bucket; ++ u32 bucket_offset; ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, ptr->dev)) { ++ prt_printf(err, "pointer to invalid device (%u)", ptr->dev); ++ return -EINVAL; ++ } ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) { ++ prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); ++ return -EINVAL; ++ } ++ ++ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); ++ ++ if (bucket >= ca->mi.nbuckets) { ++ prt_printf(err, "pointer past last bucket (%llu > %llu)", ++ bucket, ca->mi.nbuckets); ++ return -EINVAL; ++ } ++ ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { ++ prt_printf(err, "pointer before first bucket (%llu < %u)", ++ bucket, ca->mi.first_bucket); ++ return -EINVAL; ++ } ++ ++ if (bucket_offset + size_ondisk > ca->mi.bucket_size) { ++ prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", ++ bucket_offset, size_ondisk, ca->mi.bucket_size); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union 
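extent_ptr_invalid() above checks each pointer against the device's bucket geometry: the bucket it lands in must lie in [first_bucket, nbuckets), and the offset within that bucket plus the on-disk size must not spill into the next bucket. A standalone sketch of those range checks (hypothetical geometry struct; the real code also rejects duplicate pointers to the same device):

#include <stdbool.h>
#include <stdint.h>

struct dev_geometry {
	uint64_t first_bucket;	/* first usable bucket */
	uint64_t nbuckets;	/* one past the last bucket */
	uint32_t bucket_size;	/* sectors per bucket */
};

static bool ptr_in_bounds(const struct dev_geometry *g,
			  uint64_t offset, uint32_t size_ondisk)
{
	uint64_t bucket = offset / g->bucket_size;
	uint32_t bucket_offset = offset % g->bucket_size;

	if (bucket >= g->nbuckets)
		return false;	/* past the last bucket */
	if (bucket < g->first_bucket)
		return false;	/* before the first usable bucket */
	if (bucket_offset + size_ondisk > g->bucket_size)
		return false;	/* extent would span two buckets */

	return true;
}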
bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ unsigned nonce = UINT_MAX; ++ unsigned nr_ptrs = 0; ++ int ret; ++ ++ if (bkey_is_btree_ptr(k.k)) ++ size_ondisk = btree_sectors(c); ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { ++ prt_printf(err, "invalid extent entry type (got %u, max %u)", ++ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); ++ return -EINVAL; ++ } ++ ++ if (bkey_is_btree_ptr(k.k) && ++ !extent_entry_is_ptr(entry)) { ++ prt_printf(err, "has non ptr field"); ++ return -EINVAL; ++ } ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, ++ false, err); ++ if (ret) ++ return ret; ++ nr_ptrs++; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) { ++ prt_printf(err, "checksum offset + key size > uncompressed size"); ++ return -EINVAL; ++ } ++ ++ size_ondisk = crc.compressed_size; ++ ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) { ++ prt_printf(err, "invalid checksum type"); ++ return -EINVAL; ++ } ++ ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { ++ prt_printf(err, "invalid compression type"); ++ return -EINVAL; ++ } ++ ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) { ++ prt_printf(err, "incorrect nonce"); ++ return -EINVAL; ++ } ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++ ++ if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { ++ prt_str(err, "too many ptrs"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_ptr_swab(struct bkey_s k) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ u64 *d; ++ ++ for (d = (u64 *) ptrs.start; ++ d != (u64 *) ptrs.end; ++ d++) ++ *d = swab64(*d); ++ ++ for (entry = ptrs.start; ++ entry < ptrs.end; ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++} ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 sub; ++ ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); ++ ++ sub = where.offset - bkey_start_offset(k.k); ++ ++ k.k->size -= sub; ++ ++ if (!k.k->size) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if 
(!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } ++ ++ break; ++ } ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); ++ break; ++ } ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: { ++ void *p = bkey_inline_data_p(k); ++ unsigned bytes = bkey_inline_data_bytes(k.k); ++ ++ sub = min_t(u64, sub << 9, bytes); ++ ++ memmove(p, p + sub, bytes - sub); ++ ++ new_val_u64s -= sub >> 3; ++ break; ++ } ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} ++ ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 len = 0; ++ ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); ++ ++ len = where.offset - bkey_start_offset(k.k); ++ ++ k.k->p.offset = where.offset; ++ k.k->size = len; ++ ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: ++ new_val_u64s = (bkey_inline_data_offset(k.k) + ++ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; ++ break; ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +new file mode 100644 +index 000000000000..3c17b81130bb +--- /dev/null ++++ b/fs/bcachefs/extents.h +@@ -0,0 +1,685 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_H ++#define _BCACHEFS_EXTENTS_H ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "extents_types.h" ++ ++struct bch_fs; ++struct btree_trans; ++ ++/* extent entries: */ ++ ++#define extent_entry_last(_e) \ ++ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) ++ ++#define entry_to_ptr(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ ++ \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const struct bch_extent_ptr *) (_entry), \ ++ (struct bch_extent_ptr *) (_entry)); \ ++}) ++ ++/* downcast, preserves const */ ++#define to_entry(_entry) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ ++ !type_is(_entry, struct bch_extent_ptr *) && \ ++ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ ++ \ ++ __builtin_choose_expr( \ ++ (type_is_exact(_entry, const union bch_extent_crc *) || \ ++ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ ++ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ ++ (const union bch_extent_entry *) (_entry), \ ++ (union bch_extent_entry *) (_entry)); \ ++}) ++ ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ ++static inline unsigned ++__extent_entry_type(const union bch_extent_entry *e) ++{ ++ return e->type ? 
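bch2_cut_front_s() above trims sectors off the front of an extent: the key shrinks by sub, pointers not covered by a checksum entry are advanced by sub sectors, while checksummed data instead advances the crc entry's offset so the stored checksum still covers the original on-disk block. A reduced sketch of that front-trim bookkeeping for one pointer plus one optional crc entry (hypothetical struct, not the bcachefs key layout):

#include <stdbool.h>
#include <stdint.h>

struct mini_extent {
	uint64_t start, size;	/* logical start and length, in sectors */
	uint64_t ptr_offset;	/* where the data lives on the device */
	bool has_crc;
	uint32_t crc_offset;	/* offset of live data into checksummed region */
};

/* Drop everything before `where`; returns false if nothing is left. */
static bool cut_front(uint64_t where, struct mini_extent *e)
{
	if (where <= e->start)
		return true;		/* nothing to trim */

	uint64_t sub = where - e->start;

	if (sub >= e->size)
		return false;		/* whole extent trimmed away */

	e->start += sub;
	e->size -= sub;

	if (e->has_crc)
		e->crc_offset += sub;	/* checksum still covers the old range */
	else
		e->ptr_offset += sub;	/* point straight at the new start */

	return true;
}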
__ffs(e->type) : BCH_EXTENT_ENTRY_MAX; ++} ++ ++static inline enum bch_extent_entry_type ++extent_entry_type(const union bch_extent_entry *e) ++{ ++ int ret = __ffs(e->type); ++ ++ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); ++ ++ return ret; ++} ++ ++static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) ++{ ++ switch (extent_entry_type(entry)) { ++#define x(f, n) \ ++ case BCH_EXTENT_ENTRY_##f: \ ++ return sizeof(struct bch_extent_##f); ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) ++{ ++ return extent_entry_bytes(entry) / sizeof(u64); ++} ++ ++static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) ++{ ++ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; ++} ++ ++static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) ++{ ++ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; ++} ++ ++static inline bool extent_entry_is_crc(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++union bch_extent_crc { ++ u8 type; ++ struct bch_extent_crc32 crc32; ++ struct bch_extent_crc64 crc64; ++ struct bch_extent_crc128 crc128; ++}; ++ ++#define __entry_to_crc(_entry) \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const union bch_extent_crc *) (_entry), \ ++ (union bch_extent_crc *) (_entry)) ++ ++#define entry_to_crc(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ ++ \ ++ __entry_to_crc(_entry); \ ++}) ++ ++static inline struct bch_extent_crc_unpacked ++bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) ++{ ++#define common_fields(_crc) \ ++ .csum_type = _crc.csum_type, \ ++ .compression_type = _crc.compression_type, \ ++ .compressed_size = _crc._compressed_size + 1, \ ++ .uncompressed_size = _crc._uncompressed_size + 1, \ ++ .offset = _crc.offset, \ ++ .live_size = k->size ++ ++ if (!crc) ++ return (struct bch_extent_crc_unpacked) { ++ .compressed_size = k->size, ++ .uncompressed_size = k->size, ++ .live_size = k->size, ++ }; ++ ++ switch (extent_entry_type(to_entry(crc))) { ++ case BCH_EXTENT_ENTRY_crc32: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc32), ++ }; ++ ++ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; ++ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, ++ sizeof(crc->crc32.csum)); ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc64: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc64), ++ .nonce = crc->crc64.nonce, ++ .csum.lo = (__force __le64) crc->crc64.csum_lo, ++ }; ++ ++ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc128), ++ .nonce = crc->crc128.nonce, ++ .csum = crc->crc128.csum, ++ }; ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++#undef common_fields ++} ++ ++static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) ++{ ++ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); ++} ++ ++/* bkey_ptrs: generically over any key type that has ptrs */ ++ ++struct bkey_ptrs_c { ++ const 
union bch_extent_entry *start; ++ const union bch_extent_entry *end; ++}; ++ ++struct bkey_ptrs { ++ union bch_extent_entry *start; ++ union bch_extent_entry *end; ++}; ++ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ case KEY_TYPE_btree_ptr_v2: { ++ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} ++ ++#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ ++ for ((_entry) = (_start); \ ++ (_entry) < (_end); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define __bkey_ptr_next(_ptr, _end) \ ++({ \ ++ typeof(_end) _entry; \ ++ \ ++ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ ++ if (extent_entry_is_ptr(_entry)) \ ++ break; \ ++ \ ++ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ ++}) ++ ++#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) ++ ++#define bkey_extent_entry_for_each(_p, _entry) \ ++ bkey_extent_entry_for_each_from(_p, _entry, _p.start) ++ ++#define __bkey_for_each_ptr(_start, _end, _ptr) \ ++ for ((_ptr) = (_start); \ ++ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ ++ (_ptr)++) ++ ++#define bkey_ptr_next(_p, _ptr) \ ++ __bkey_ptr_next(_ptr, (_p).end) ++ ++#define bkey_for_each_ptr(_p, _ptr) \ ++ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) ++ ++#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ++({ \ ++ __label__ out; \ ++ \ ++ (_ptr).idx = 0; \ ++ (_ptr).has_ec = false; \ ++ \ ++ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ ++ switch (extent_entry_type(_entry)) { \ ++ case BCH_EXTENT_ENTRY_ptr: \ ++ (_ptr).ptr = _entry->ptr; \ ++ goto out; \ ++ case BCH_EXTENT_ENTRY_crc32: \ ++ case BCH_EXTENT_ENTRY_crc64: \ ++ case BCH_EXTENT_ENTRY_crc128: \ ++ (_ptr).crc = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_entry)); \ ++ break; \ ++ case BCH_EXTENT_ENTRY_stripe_ptr: \ ++ (_ptr).ec = _entry->stripe_ptr; \ ++ (_ptr).has_ec = true; \ ++ break; \ ++ } \ ++out: \ ++ _entry < (_end); \ ++}) ++ ++#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ ++ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ ++ (_entry) = _start; \ ++ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ ++ _ptr, _entry) ++ ++#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ ++({ \ ++ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ ++ if (extent_entry_is_crc(_iter)) { \ ++ (_crc) = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_iter)); \ ++ break; \ ++ } \ ++ \ ++ (_iter) < (_end); \ ++}) ++ ++#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ ++ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ ++ (_iter) = (_start); \ ++ bkey_crc_next(_k, _start, _end, _crc, _iter); \ ++ (_iter) = extent_entry_next(_iter)) ++ ++#define bkey_for_each_crc(_k, _p, _crc, _iter) \ ++ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) ++ ++/* Iterate over pointers in KEY_TYPE_extent: */ ++ ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) ++ ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++ ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) ++ ++/* utility code common to all keys with pointers: */ ++ ++void bch2_mark_io_failure(struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void 
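The bkey_for_each_ptr()/bkey_for_each_ptr_decode() macros above all walk the same on-disk layout: a packed run of variable-size entries, where each entry's type byte identifies what it is and therefore how far to step to reach the next one. A reduced sketch of walking such a packed buffer (hypothetical entry layout with only two entry kinds, not the bch_extent_entry format):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum entry_type { ENTRY_PTR = 0, ENTRY_CRC = 1 };

struct entry_hdr { uint8_t type; };			/* holds 1 << entry_type */
struct ptr_entry { struct entry_hdr h; uint64_t offset; };
struct crc_entry { struct entry_hdr h; uint32_t csum; };

/* Size of the entry starting at h, or 0 if the type byte is unrecognized. */
static size_t entry_bytes(const struct entry_hdr *h)
{
	if (h->type == (1u << ENTRY_PTR))
		return sizeof(struct ptr_entry);
	if (h->type == (1u << ENTRY_CRC))
		return sizeof(struct crc_entry);
	return 0;
}

/* Walk the packed entries in [start, end), printing the pointers. */
static void walk(const void *start, const void *end)
{
	const char *p = start;

	while ((const void *)p < end) {
		const struct entry_hdr *h = (const void *)p;
		size_t bytes = entry_bytes(h);

		if (!bytes)
			break;				/* unknown entry: stop */

		if (h->type == (1u << ENTRY_PTR)) {
			const struct ptr_entry *ptr = (const void *)p;

			printf("ptr at offset %llu\n",
			       (unsigned long long)ptr->offset);
		}

		p += bytes;				/* step by this entry's size */
	}
}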
bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, ++ int, struct bkey_s); ++ ++#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_v2_invalid, \ ++ .val_to_text = bch2_btree_ptr_v2_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .compat = bch2_btree_ptr_v2_compat, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++/* KEY_TYPE_extent: */ ++ ++bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++ .key_invalid = bch2_bkey_ptrs_invalid, \ ++ .val_to_text = bch2_bkey_ptrs_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .key_normalize = bch2_extent_normalize, \ ++ .key_merge = bch2_extent_merge, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++ .key_invalid = bch2_reservation_invalid, \ ++ .val_to_text = bch2_reservation_to_text, \ ++ .key_merge = bch2_reservation_merge, \ ++ .trans_trigger = bch2_trans_mark_reservation, \ ++ .atomic_trigger = bch2_mark_reservation, \ ++} ++ ++/* Extent checksum entries: */ ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); ++ ++/* Generic code for keys with pointers: */ ++ ++static inline bool bkey_is_btree_ptr(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_direct_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_inline_data(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_inline_data || ++ k->type == KEY_TYPE_indirect_inline_data; ++} ++ ++static inline unsigned bkey_inline_data_offset(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_inline_data: ++ return sizeof(struct bch_inline_data); ++ case KEY_TYPE_indirect_inline_data: ++ return sizeof(struct bch_indirect_inline_data); ++ default: ++ BUG(); ++ } ++} ++ ++static inline unsigned bkey_inline_data_bytes(const struct bkey *k) ++{ ++ return bkey_val_bytes(k) - bkey_inline_data_offset(k); ++} ++ ++#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) ++ ++static inline bool bkey_extent_is_data(const struct bkey *k) ++{ ++ return bkey_extent_is_direct_data(k) || ++ bkey_extent_is_inline_data(k) || ++ k->type == KEY_TYPE_reflink_p; ++} ++ ++/* ++ * Should extent be counted under inode->i_sectors? 
++ */ ++static inline bool bkey_extent_is_allocation(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reservation: ++ case KEY_TYPE_reflink_p: ++ case KEY_TYPE_reflink_v: ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ return BCH_DATA_btree; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return BCH_DATA_user; ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ ++ BUG_ON(ptr < s.v->ptrs || ++ ptr >= s.v->ptrs + s.v->nr_blocks); ++ ++ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant ++ ? 
BCH_DATA_parity ++ : BCH_DATA_user; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++bool bch2_bkey_is_incompressible(struct bkey_s_c); ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); ++ ++unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); ++ ++#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ ++do { \ ++ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ ++ \ ++ _ptr = &_ptrs.start->ptr; \ ++ \ ++ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ ++ if (_cond) { \ ++ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ ++ _ptrs = bch2_bkey_ptrs(_k); \ ++ continue; \ ++ } \ ++ \ ++ (_ptr)++; \ ++ } \ ++} while (0) ++ ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); ++bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); ++ ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++ ++void bch2_ptr_swab(struct bkey_s); ++ ++/* Generic extent code: */ ++ ++enum bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ ++int bch2_cut_front_s(struct bpos, struct bkey_s); ++int bch2_cut_back_s(struct bpos, struct bkey_s); ++ ++static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_front_s(where, bkey_i_to_s(k)); ++} ++ ++static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_back_s(where, bkey_i_to_s(k)); ++} ++ ++/** ++ * bch_key_resize - adjust size of @k ++ * ++ * bkey_start_offset(k) will be preserved, modifies where the extent ends ++ */ ++static inline void bch2_key_resize(struct bkey *k, unsigned new_size) ++{ ++ k->p.offset -= k->size; ++ k->p.offset += new_size; ++ k->size = new_size; ++} ++ ++/* ++ * In extent_sort_fix_overlapping(), insert_fixup_extent(), ++ * extent_merge_inline() - we're modifying keys in place that are packed. To do ++ * that we have to unpack the key, modify the unpacked key - then this ++ * copies/repacks the unpacked to the original as necessary. 
++ */ ++static inline void extent_save(struct btree *b, struct bkey_packed *dst, ++ struct bkey *src) ++{ ++ struct bkey_format *f = &b->format; ++ struct bkey_i *dst_unpacked; ++ ++ if ((dst_unpacked = packed_to_bkey(dst))) ++ dst_unpacked->k = *src; ++ else ++ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); ++} ++ ++#endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +new file mode 100644 +index 000000000000..43d6c341ecca +--- /dev/null ++++ b/fs/bcachefs/extents_types.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_TYPES_H ++#define _BCACHEFS_EXTENTS_TYPES_H ++ ++#include "bcachefs_format.h" ++ ++struct bch_extent_crc_unpacked { ++ u32 compressed_size; ++ u32 uncompressed_size; ++ u32 live_size; ++ ++ u8 csum_type; ++ u8 compression_type; ++ ++ u16 offset; ++ ++ u16 nonce; ++ ++ struct bch_csum csum; ++}; ++ ++struct extent_ptr_decoded { ++ unsigned idx; ++ bool has_ec; ++ struct bch_extent_crc_unpacked crc; ++ struct bch_extent_ptr ptr; ++ struct bch_extent_stripe_ptr ec; ++}; ++ ++struct bch_io_failures { ++ u8 nr; ++ struct bch_dev_io_failures { ++ u8 dev; ++ u8 idx; ++ u8 nr_failed; ++ u8 nr_retries; ++ } devs[BCH_REPLICAS_MAX]; ++}; ++ ++#endif /* _BCACHEFS_EXTENTS_TYPES_H */ +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +new file mode 100644 +index 000000000000..05429c9631cd +--- /dev/null ++++ b/fs/bcachefs/eytzinger.h +@@ -0,0 +1,281 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _EYTZINGER_H ++#define _EYTZINGER_H ++ ++#include ++#include ++ ++#include "util.h" ++ ++/* ++ * Traversal for trees in eytzinger layout - a full binary tree layed out in an ++ * array ++ */ ++ ++/* ++ * One based indexing version: ++ * ++ * With one based indexing each level of the tree starts at a power of two - ++ * good for cacheline alignment: ++ */ ++ ++static inline unsigned eytzinger1_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + child; ++} ++ ++static inline unsigned eytzinger1_left_child(unsigned i) ++{ ++ return eytzinger1_child(i, 0); ++} ++ ++static inline unsigned eytzinger1_right_child(unsigned i) ++{ ++ return eytzinger1_child(i, 1); ++} ++ ++static inline unsigned eytzinger1_first(unsigned size) ++{ ++ return rounddown_pow_of_two(size); ++} ++ ++static inline unsigned eytzinger1_last(unsigned size) ++{ ++ return rounddown_pow_of_two(size + 1) - 1; ++} ++ ++/* ++ * eytzinger1_next() and eytzinger1_prev() have the nice properties that ++ * ++ * eytzinger1_next(0) == eytzinger1_first()) ++ * eytzinger1_prev(0) == eytzinger1_last()) ++ * ++ * eytzinger1_prev(eytzinger1_first()) == 0 ++ * eytzinger1_next(eytzinger1_last()) == 0 ++ */ ++ ++static inline unsigned eytzinger1_next(unsigned i, unsigned size) ++{ ++ EBUG_ON(i > size); ++ ++ if (eytzinger1_right_child(i) <= size) { ++ i = eytzinger1_right_child(i); ++ ++ i <<= __fls(size + 1) - __fls(i); ++ i >>= i > size; ++ } else { ++ i >>= ffz(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_prev(unsigned i, unsigned size) ++{ ++ EBUG_ON(i > size); ++ ++ if (eytzinger1_left_child(i) <= size) { ++ i = eytzinger1_left_child(i) + 1; ++ ++ i <<= __fls(size + 1) - __fls(i); ++ i -= 1; ++ i >>= i > size; ++ } else { ++ i >>= __ffs(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_extra(unsigned size) ++{ ++ return (size + 1 - rounddown_pow_of_two(size)) << 1; ++} ++ ++static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ++ unsigned 
extra) ++{ ++ unsigned b = __fls(i); ++ unsigned shift = __fls(size) - b; ++ int s; ++ ++ EBUG_ON(!i || i > size); ++ ++ i ^= 1U << b; ++ i <<= 1; ++ i |= 1; ++ i <<= shift; ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i -= (i - extra) >> 1; ++ */ ++ s = extra - i; ++ i += (s >> 1) & (s >> 31); ++ ++ return i; ++} ++ ++static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned shift; ++ int s; ++ ++ EBUG_ON(!i || i > size); ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i += i - extra; ++ */ ++ s = extra - i; ++ i -= s & (s >> 31); ++ ++ shift = __ffs(i); ++ ++ i >>= shift + 1; ++ i |= 1U << (__fls(size) - shift); ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); ++} ++ ++#define eytzinger1_for_each(_i, _size) \ ++ for ((_i) = eytzinger1_first((_size)); \ ++ (_i) != 0; \ ++ (_i) = eytzinger1_next((_i), (_size))) ++ ++/* Zero based indexing version: */ ++ ++static inline unsigned eytzinger0_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + 1 + child; ++} ++ ++static inline unsigned eytzinger0_left_child(unsigned i) ++{ ++ return eytzinger0_child(i, 0); ++} ++ ++static inline unsigned eytzinger0_right_child(unsigned i) ++{ ++ return eytzinger0_child(i, 1); ++} ++ ++static inline unsigned eytzinger0_first(unsigned size) ++{ ++ return eytzinger1_first(size) - 1; ++} ++ ++static inline unsigned eytzinger0_last(unsigned size) ++{ ++ return eytzinger1_last(size) - 1; ++} ++ ++static inline unsigned eytzinger0_next(unsigned i, unsigned size) ++{ ++ return eytzinger1_next(i + 1, size) - 1; ++} ++ ++static inline unsigned eytzinger0_prev(unsigned i, unsigned size) ++{ ++ return eytzinger1_prev(i + 1, size) - 1; ++} ++ ++static inline unsigned eytzinger0_extra(unsigned size) ++{ ++ return eytzinger1_extra(size); ++} ++ ++static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __eytzinger1_to_inorder(i + 1, size, extra) - 1; ++} ++ ++static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __inorder_to_eytzinger1(i + 1, size, extra) - 1; ++} ++ ++static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); ++} ++ ++#define eytzinger0_for_each(_i, _size) \ ++ for ((_i) = eytzinger0_first((_size)); \ ++ (_i) != -1; \ ++ (_i) = eytzinger0_next((_i), (_size))) ++ ++typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); ++ ++/* return greatest node <= @search, or -1 if not found */ ++static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ++ eytzinger_cmp_fn cmp, const void *search) ++{ ++ unsigned i, n = 0; ++ ++ if (!nr) ++ return -1; ++ ++ do { ++ i = n; ++ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); ++ } while (n < nr); ++ ++ if (n & 1) { ++ /* @i was greater than @search, return previous node: */ ++ ++ if (i == eytzinger0_first(nr)) ++ return -1; ++ ++ return eytzinger0_prev(i, nr); ++ } else { ++ return i; ++ } ++} ++ ++#define 
eytzinger0_find(base, nr, size, _cmp, search) \ ++({ \ ++ void *_base = (base); \ ++ void *_search = (search); \ ++ size_t _nr = (nr); \ ++ size_t _size = (size); \ ++ size_t _i = 0; \ ++ int _res; \ ++ \ ++ while (_i < _nr && \ ++ (_res = _cmp(_search, _base + _i * _size, _size))) \ ++ _i = eytzinger0_child(_i, _res > 0); \ ++ _i; \ ++}) ++ ++void eytzinger0_sort(void *, size_t, size_t, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++#endif /* _EYTZINGER_H */ +diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h +new file mode 100644 +index 000000000000..cdb272708a4b +--- /dev/null ++++ b/fs/bcachefs/fifo.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FIFO_H ++#define _BCACHEFS_FIFO_H ++ ++#include "util.h" ++ ++#define FIFO(type) \ ++struct { \ ++ size_t front, back, size, mask; \ ++ type *data; \ ++} ++ ++#define DECLARE_FIFO(type, name) FIFO(type) name ++ ++#define fifo_buf_size(fifo) \ ++ ((fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ ++ : 0) ++ ++#define init_fifo(fifo, _size, _gfp) \ ++({ \ ++ (fifo)->front = (fifo)->back = 0; \ ++ (fifo)->size = (_size); \ ++ (fifo)->mask = (fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) - 1 \ ++ : 0; \ ++ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ ++}) ++ ++#define free_fifo(fifo) \ ++do { \ ++ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ ++ (fifo)->data = NULL; \ ++} while (0) ++ ++#define fifo_swap(l, r) \ ++do { \ ++ swap((l)->front, (r)->front); \ ++ swap((l)->back, (r)->back); \ ++ swap((l)->size, (r)->size); \ ++ swap((l)->mask, (r)->mask); \ ++ swap((l)->data, (r)->data); \ ++} while (0) ++ ++#define fifo_move(dest, src) \ ++do { \ ++ typeof(*((dest)->data)) _t; \ ++ while (!fifo_full(dest) && \ ++ fifo_pop(src, _t)) \ ++ fifo_push(dest, _t); \ ++} while (0) ++ ++#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) ++#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) ++ ++#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) ++#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) ++ ++#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) ++#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) ++ ++#define fifo_entry_idx_abs(fifo, p) \ ++ ((((p) >= &fifo_peek_front(fifo) \ ++ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ ++ (((p) - (fifo)->data))) ++ ++#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) ++#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++ ++#define fifo_push_back_ref(f) \ ++ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) ++ ++#define fifo_push_front_ref(f) \ ++ (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) ++ ++#define fifo_push_back(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_push_front(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_pop_front(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_pop_back(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) ++#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) ++#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) ++#define fifo_peek(fifo) fifo_peek_front(fifo) ++ ++#define fifo_for_each_entry(_entry, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#endif /* _BCACHEFS_FIFO_H */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +new file mode 100644 +index 000000000000..53ffc684223c +--- /dev/null ++++ b/fs/bcachefs/fs-common.c +@@ -0,0 +1,496 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fs-common.h" ++#include "inode.h" ++#include "subvolume.h" ++#include "xattr.h" ++ ++#include ++ ++static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) ++{ ++ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; ++} ++ ++int bch2_create_trans(struct btree_trans *trans, ++ subvol_inum dir, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *new_inode, ++ const struct qstr *name, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct posix_acl *default_acl, ++ struct posix_acl *acl, ++ subvol_inum snapshot_src, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; ++ subvol_inum new_inum = dir; ++ u64 now = bch2_current_time(c); ++ u64 cpu = raw_smp_processor_id(); ++ u64 dir_target; ++ u32 snapshot; ++ unsigned dir_type = mode_to_type(mode); ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (!(flags & BCH_CREATE_SNAPSHOT)) { ++ /* Normal create path - allocate a new inode: */ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ ++ if (flags & BCH_CREATE_TMPFILE) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ ++ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); ++ if (ret) ++ goto err; ++ ++ snapshot_src = (subvol_inum) { 0 }; ++ } else { ++ /* ++ * Creating a snapshot - we're not allocating a new inode, but ++ * we do have to lookup the root inode of the subvolume we're ++ * snapshotting and update it (in the new snapshot): ++ */ ++ ++ if (!snapshot_src.inum) { ++ /* Inode wasn't specified, just snapshot: */ ++ 
struct bch_subvolume s; ++ ++ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, ++ BTREE_ITER_CACHED, &s); ++ if (ret) ++ goto err; ++ ++ snapshot_src.inum = le64_to_cpu(s.inode); ++ } ++ ++ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (new_inode->bi_subvol != snapshot_src.subvol) { ++ /* Not a subvolume root: */ ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ /* ++ * If we're not root, we have to own the subvolume being ++ * snapshotted: ++ */ ++ if (uid && new_inode->bi_uid != uid) { ++ ret = -EPERM; ++ goto err; ++ } ++ ++ flags |= BCH_CREATE_SUBVOL; ++ } ++ ++ new_inum.inum = new_inode->bi_inum; ++ dir_target = new_inode->bi_inum; ++ ++ if (flags & BCH_CREATE_SUBVOL) { ++ u32 new_subvol, dir_snapshot; ++ ++ ret = bch2_subvolume_create(trans, new_inode->bi_inum, ++ snapshot_src.subvol, ++ &new_subvol, &snapshot, ++ (flags & BCH_CREATE_SNAPSHOT_RO) != 0); ++ if (ret) ++ goto err; ++ ++ new_inode->bi_parent_subvol = dir.subvol; ++ new_inode->bi_subvol = new_subvol; ++ new_inum.subvol = new_subvol; ++ dir_target = new_subvol; ++ dir_type = DT_SUBVOL; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); ++ ret = bch2_btree_iter_traverse(&dir_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (!(flags & BCH_CREATE_SNAPSHOT)) { ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ goto err; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) { ++ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ u64 dir_offset; ++ ++ if (is_subdir_for_nlink(new_inode)) ++ dir_u->bi_nlink++; ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, &dir_iter, dir_u); ++ if (ret) ++ goto err; ++ ++ ret = bch2_dirent_create(trans, dir, &dir_hash, ++ dir_type, ++ name, ++ dir_target, ++ &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ new_inode->bi_dir = dir_u->bi_inum; ++ new_inode->bi_dir_offset = dir_offset; ++ } ++ } ++ ++ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++ bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ++ ++ ret = bch2_btree_iter_traverse(&inode_iter) ?: ++ bch2_inode_write(trans, &inode_iter, new_inode); ++err: ++ bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); ++ return ret; ++} ++ ++int bch2_link_trans(struct btree_trans *trans, ++ subvol_inum dir, struct bch_inode_unpacked *dir_u, ++ subvol_inum inum, struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_hash_info dir_hash; ++ u64 now = bch2_current_time(c); ++ u64 dir_offset = 0; ++ int ret; ++ ++ if (dir.subvol != inum.subvol) ++ return -EXDEV; ++ ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ inode_u->bi_ctime = now; ++ ret = bch2_inode_nlink_inc(inode_u); ++ if (ret) ++ return ret; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ dir_hash = bch2_hash_info_init(c, dir_u); 
++ ++ ret = bch2_dirent_create(trans, dir, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum.inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ inode_u->bi_dir = dir.inum; ++ inode_u->bi_dir_offset = dir_offset; ++ } ++ ++ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: ++ bch2_inode_write(trans, &inode_iter, inode_u); ++err: ++ bch2_trans_iter_exit(trans, &dir_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ return ret; ++} ++ ++int bch2_unlink_trans(struct btree_trans *trans, ++ subvol_inum dir, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, ++ const struct qstr *name, ++ bool deleting_snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter dirent_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_hash_info dir_hash; ++ subvol_inum inum; ++ u64 now = bch2_current_time(c); ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ dir_hash = bch2_hash_info_init(c, dir_u); ++ ++ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, ++ name, &inum, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { ++ ret = bch2_empty_dir_trans(trans, inum); ++ if (ret) ++ goto err; ++ } ++ ++ if (deleting_snapshot && !inode_u->bi_subvol) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ if (deleting_snapshot || inode_u->bi_subvol) { ++ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(&dirent_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ /* ++ * If we're deleting a subvolume, we need to really delete the ++ * dirent, not just emit a whiteout in the current snapshot: ++ */ ++ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&dirent_iter); ++ if (ret) ++ goto err; ++ } else { ++ bch2_inode_nlink_dec(trans, inode_u); ++ } ++ ++ if (inode_u->bi_dir == dirent_iter.pos.inode && ++ inode_u->bi_dir_offset == dirent_iter.pos.offset) { ++ inode_u->bi_dir = 0; ++ inode_u->bi_dir_offset = 0; ++ } ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; ++ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); ++ ++ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash, &dirent_iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_inode_write(trans, &dir_iter, dir_u) ?: ++ bch2_inode_write(trans, &inode_iter, inode_u); ++err: ++ bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); ++ return ret; ++} ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, ++ struct bch_inode_unpacked *src_u) ++{ ++ u64 src, dst; ++ unsigned id; ++ bool ret = false; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ /* Skip attributes that were explicitly set on this inode */ ++ if (dst_u->bi_fields_set & (1 << id)) ++ continue; ++ ++ src = bch2_inode_opt_get(src_u, id); ++ dst = bch2_inode_opt_get(dst_u, id); ++ ++ if (src == dst) ++ continue; ++ ++ bch2_inode_opt_set(dst_u, id, src); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++int bch2_rename_trans(struct btree_trans *trans, ++ subvol_inum src_dir, struct bch_inode_unpacked 
*src_dir_u, ++ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ struct bch_inode_unpacked *src_inode_u, ++ struct bch_inode_unpacked *dst_inode_u, ++ const struct qstr *src_name, ++ const struct qstr *dst_name, ++ enum bch_rename_mode mode) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter src_dir_iter = { NULL }; ++ struct btree_iter dst_dir_iter = { NULL }; ++ struct btree_iter src_inode_iter = { NULL }; ++ struct btree_iter dst_inode_iter = { NULL }; ++ struct bch_hash_info src_hash, dst_hash; ++ subvol_inum src_inum, dst_inum; ++ u64 src_offset, dst_offset; ++ u64 now = bch2_current_time(c); ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ src_hash = bch2_hash_info_init(c, src_dir_u); ++ ++ if (dst_dir.inum != src_dir.inum || ++ dst_dir.subvol != src_dir.subvol) { ++ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ dst_hash = bch2_hash_info_init(c, dst_dir_u); ++ } else { ++ dst_dir_u = src_dir_u; ++ dst_hash = src_hash; ++ } ++ ++ ret = bch2_dirent_rename(trans, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, ++ src_name, &src_inum, &src_offset, ++ dst_name, &dst_inum, &dst_offset, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (dst_inum.inum) { ++ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ } ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ src_inode_u->bi_dir = dst_dir_u->bi_inum; ++ src_inode_u->bi_dir_offset = dst_offset; ++ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ dst_inode_u->bi_dir = src_dir_u->bi_inum; ++ dst_inode_u->bi_dir_offset = src_offset; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE && ++ dst_inode_u->bi_dir == dst_dir_u->bi_inum && ++ dst_inode_u->bi_dir_offset == src_offset) { ++ dst_inode_u->bi_dir = 0; ++ dst_inode_u->bi_dir_offset = 0; ++ } ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ if (S_ISDIR(src_inode_u->bi_mode) != ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -ENOTDIR; ++ goto err; ++ } ++ ++ if (S_ISDIR(dst_inode_u->bi_mode) && ++ bch2_empty_dir_trans(trans, dst_inum)) { ++ ret = -ENOTEMPTY; ++ goto err; ++ } ++ } ++ ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (is_subdir_for_nlink(src_inode_u)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } ++ ++ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { ++ dst_dir_u->bi_nlink--; ++ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ bch2_inode_nlink_dec(trans, dst_inode_u); ++ ++ src_dir_u->bi_mtime = now; ++ src_dir_u->bi_ctime = now; ++ ++ if (src_dir.inum != dst_dir.inum) { ++ dst_dir_u->bi_mtime = now; ++ dst_dir_u->bi_ctime = now; ++ } ++ ++ src_inode_u->bi_ctime = now; ++ ++ if (dst_inum.inum) ++ dst_inode_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: ++ (src_dir.inum != dst_dir.inum ++ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) ++ : 0 ) ?: ++ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: ++ (dst_inum.inum ++ ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) ++ : 0 ); ++err: ++ bch2_trans_iter_exit(trans, &dst_inode_iter); ++ bch2_trans_iter_exit(trans, &src_inode_iter); ++ bch2_trans_iter_exit(trans, &dst_dir_iter); ++ bch2_trans_iter_exit(trans, &src_dir_iter); ++ return ret; ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +new file mode 100644 +index 000000000000..dde237859514 +--- /dev/null ++++ b/fs/bcachefs/fs-common.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_COMMON_H ++#define _BCACHEFS_FS_COMMON_H ++ ++struct posix_acl; ++ ++#define BCH_CREATE_TMPFILE (1U << 0) ++#define BCH_CREATE_SUBVOL (1U << 1) ++#define BCH_CREATE_SNAPSHOT (1U << 2) ++#define BCH_CREATE_SNAPSHOT_RO (1U << 3) ++ ++int bch2_create_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct posix_acl *, ++ struct posix_acl *, ++ subvol_inum, unsigned); ++ ++int bch2_link_trans(struct btree_trans *, ++ subvol_inum, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_unlink_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, bool); ++ ++int bch2_rename_trans(struct btree_trans *, ++ subvol_inum, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ const struct qstr *, ++ enum bch_rename_mode); ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000000..f37bc43e27f4 +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,3496 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++static inline bool bio_full(struct bio *bio, unsigned len) ++{ ++ if (bio->bi_vcnt >= bio->bi_max_vecs) ++ return true; ++ if (bio->bi_iter.bi_size > UINT_MAX - len) ++ return true; ++ return false; ++} ++ ++static inline struct address_space *faults_disabled_mapping(void) ++{ ++ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); ++} ++ ++static inline void set_fdm_dropped_locks(void) ++{ ++ current->faults_disabled_mapping = ++ (void *) (((unsigned long) current->faults_disabled_mapping)|1); ++} ++ ++static inline bool fdm_dropped_locks(void) ++{ ++ return ((unsigned long) current->faults_disabled_mapping) & 1; ++} ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++struct bch_writepage_io { ++ struct closure cl; ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_write { ++ struct completion done; ++ struct kiocb *req; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ sync:1, ++ free_iov:1; ++ struct quota_res 
quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ bool should_dirty; ++ struct bch_read_bio rbio; ++}; ++ ++/* pagecache_block must be held */ ++static int write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++/* quotas */ ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (!res->sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ res->sectors = 0; ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (!sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, ++ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", ++ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, ++ inode->ei_inode.bi_sectors); ++ inode->v.i_blocks += sectors; ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ 
if (quota_res && sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors -= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif ++ mutex_unlock(&inode->ei_quota_lock); ++} ++ ++/* page state: */ ++ ++/* stored in page->private: */ ++ ++struct bch_page_sector { ++ /* Uncompressed, fully allocated replicas (or on disk reservation): */ ++ unsigned nr_replicas:4; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ ++ unsigned replicas_reserved:4; ++ ++ /* i_sectors: */ ++ enum { ++ SECTOR_UNALLOCATED, ++ SECTOR_RESERVED, ++ SECTOR_DIRTY, ++ SECTOR_DIRTY_RESERVED, ++ SECTOR_ALLOCATED, ++ } state:8; ++}; ++ ++struct bch_page_state { ++ spinlock_t lock; ++ atomic_t write_count; ++ bool uptodate; ++ struct bch_page_sector s[PAGE_SECTORS]; ++}; ++ ++static inline struct bch_page_state *__bch2_page_state(struct page *page) ++{ ++ return page_has_private(page) ++ ? (struct bch_page_state *) page_private(page) ++ : NULL; ++} ++ ++static inline struct bch_page_state *bch2_page_state(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ ++ return __bch2_page_state(page); ++} ++ ++/* for newly allocated pages: */ ++static void __bch2_page_state_release(struct page *page) ++{ ++ kfree(detach_page_private(page)); ++} ++ ++static void bch2_page_state_release(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ __bch2_page_state_release(page); ++} ++ ++/* for newly allocated pages: */ ++static struct bch_page_state *__bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ struct bch_page_state *s; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ attach_page_private(page, s); ++ return s; ++} ++ ++static struct bch_page_state *bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); ++} ++ ++static unsigned bkey_to_sector_state(const struct bkey *k) ++{ ++ if (k->type == KEY_TYPE_reservation) ++ return SECTOR_RESERVED; ++ if (bkey_extent_is_allocation(k)) ++ return SECTOR_ALLOCATED; ++ return SECTOR_UNALLOCATED; ++} ++ ++static void __bch2_page_state_set(struct page *page, ++ unsigned pg_offset, unsigned pg_len, ++ unsigned nr_ptrs, unsigned state) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); ++ unsigned i; ++ ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ spin_lock(&s->lock); ++ ++ for (i = pg_offset; i < pg_offset + pg_len; i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ ++ if (i == PAGE_SECTORS) ++ s->uptodate = true; ++ ++ spin_unlock(&s->lock); ++} ++ ++static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, ++ struct page **pages, unsigned nr_pages) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; ++ unsigned pg_idx = 0; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k.k); ++ ++ 
while (pg_idx < nr_pages) { ++ struct page *page = pages[pg_idx]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; ++ unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; ++ ++ BUG_ON(k.k->p.offset < pg_start); ++ BUG_ON(bkey_start_offset(k.k) > pg_end); ++ ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) ++ __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); ++ ++ if (k.k->p.offset < pg_end) ++ break; ++ pg_idx++; ++ } ++ ++ if (pg_idx == nr_pages) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k.k); ++ ++ bio_for_each_segment(bv, bio, iter) ++ __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, ++ bv.bv_len >> 9, nr_ptrs, state); ++} ++ ++static void mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++static void mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ s64 i_sectors_delta = 0; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ switch (s->s[j].state) { ++ case 
SECTOR_UNALLOCATED: ++ s->s[j].state = SECTOR_RESERVED; ++ break; ++ case SECTOR_DIRTY: ++ s->s[j].state = SECTOR_DIRTY_RESERVED; ++ i_sectors_delta--; ++ break; ++ default: ++ break; ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++ ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++} ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_page_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++static int bch2_get_page_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct page *page, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++struct bch2_page_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static void bch2_page_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++static void bch2_page_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++static int bch2_page_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ BUG_ON(!s->uptodate); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, ++ disk_sectors, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, ++ check_enospc); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_page_bits(struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_page_state *s = bch2_page_state(page); ++ struct disk_reservation disk_res = { 0 }; ++ int i, dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(PageWriteback(page)); ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ switch (s->s[i].state) { ++ case SECTOR_DIRTY: ++ s->s[i].state = SECTOR_UNALLOCATED; ++ --dirty_sectors; ++ break; ++ case SECTOR_DIRTY_RESERVED: ++ s->s[i].state = SECTOR_RESERVED; ++ break; ++ default: ++ break; ++ } ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ i_sectors_acct(c, inode, NULL, dirty_sectors); ++ ++ bch2_page_state_release(page); ++} ++ ++static void bch2_set_page_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) page_offset(page) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += sectors; ++ res->disk.sectors -= sectors; ++ ++ switch (s->s[i].state) { ++ case SECTOR_UNALLOCATED: ++ s->s[i].state = SECTOR_DIRTY; ++ dirty_sectors++; ++ break; ++ case SECTOR_RESERVED: ++ s->s[i].state = SECTOR_DIRTY_RESERVED; ++ break; ++ default: ++ break; ++ } ++ } ++ ++ spin_unlock(&s->lock); ++ ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!PageDirty(page)) ++ __set_page_dirty_nobuffers(page); ++} ++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct address_space *mapping = file->f_mapping; ++ struct address_space *fdm = faults_disabled_mapping(); ++ struct bch_inode_info *inode = file_bch_inode(file); ++ int ret; ++ ++ if (fdm == mapping) ++ return VM_FAULT_SIGBUS; ++ ++ /* Lock ordering: */ ++ if (fdm > mapping) { ++ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); ++ ++ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) ++ goto got_lock; ++ ++ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); ++ ++ /* Signal that lock has been dropped: */ ++ set_fdm_dropped_locks(); ++ return VM_FAULT_SIGBUS; ++ } ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++got_lock: ++ ret = filemap_fault(vmf); ++ 
bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct page *page = vmf->page; ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation res; ++ unsigned len; ++ loff_t isize; ++ int ret; ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ lock_page(page); ++ isize = i_size_read(&inode->v); ++ ++ if (page->mapping != mapping || page_offset(page) >= isize) { ++ unlock_page(page); ++ ret = VM_FAULT_NOPAGE; ++ goto out; ++ } ++ ++ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); ++ ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ } ++ ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ ++ bch2_set_page_dirty(c, inode, page, &res, 0, len); ++ bch2_page_reservation_put(c, inode, &res); ++ ++ wait_for_stable_page(page); ++ ret = VM_FAULT_LOCKED; ++out: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) ++{ ++ if (offset || length < folio_size(folio)) ++ return; ++ ++ bch2_clear_page_bits(&folio->page); ++} ++ ++int bch2_releasepage(struct page *page, gfp_t gfp_mask) ++{ ++ if (PageDirty(page)) ++ return 0; ++ ++ bch2_clear_page_bits(page); ++ return 1; ++} ++ ++#ifdef CONFIG_MIGRATION ++int bch2_migrate_page(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ int ret; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(!PageLocked(newpage)); ++ ++ ret = migrate_page_move_mapping(mapping, newpage, page, 0); ++ if (ret != MIGRATEPAGE_SUCCESS) ++ return ret; ++ ++ if (PagePrivate(page)) ++ attach_page_private(newpage, detach_page_private(page)); ++ ++ if (mode != MIGRATE_SYNC_NO_COPY) ++ migrate_page_copy(newpage, page); ++ else ++ migrate_page_states(newpage, page); ++ return MIGRATEPAGE_SUCCESS; ++} ++#endif ++ ++/* readpage(s): */ ++ ++static void bch2_readpages_end_io(struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) { ++ struct page *page = bv->bv_page; ++ ++ if (!bio->bi_status) { ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ } ++ ++ bio_put(bio); ++} ++ ++struct readpages_iter { ++ struct address_space *mapping; ++ struct page **pages; ++ unsigned nr_pages; ++ unsigned idx; ++ pgoff_t offset; ++}; ++ ++static int readpages_iter_init(struct readpages_iter *iter, ++ struct readahead_control *ractl) ++{ ++ unsigned i, nr_pages = readahead_count(ractl); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->mapping = ractl->mapping; ++ iter->offset = readahead_index(ractl); ++ 
iter->nr_pages = nr_pages; ++ ++ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); ++ if (!iter->pages) ++ return -ENOMEM; ++ ++ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); ++ for (i = 0; i < nr_pages; i++) { ++ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); ++ put_page(iter->pages[i]); ++ } ++ ++ return 0; ++} ++ ++static inline struct page *readpage_iter_next(struct readpages_iter *iter) ++{ ++ if (iter->idx >= iter->nr_pages) ++ return NULL; ++ ++ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); ++ ++ return iter->pages[iter->idx]; ++} ++ ++static bool extent_partial_reads_expensive(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc.csum_type || crc.compression_type) ++ return true; ++ return false; ++} ++ ++static void readpage_bio_extend(struct readpages_iter *iter, ++ struct bio *bio, ++ unsigned sectors_this_extent, ++ bool get_more) ++{ ++ while (bio_sectors(bio) < sectors_this_extent && ++ bio->bi_vcnt < bio->bi_max_vecs) { ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; ++ struct page *page = readpage_iter_next(iter); ++ int ret; ++ ++ if (page) { ++ if (iter->offset + iter->idx != page_offset) ++ break; ++ ++ iter->idx++; ++ } else { ++ if (!get_more) ++ break; ++ ++ page = xa_load(&iter->mapping->i_pages, page_offset); ++ if (page && !xa_is_value(page)) ++ break; ++ ++ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); ++ if (!page) ++ break; ++ ++ if (!__bch2_page_state_create(page, 0)) { ++ put_page(page); ++ break; ++ } ++ ++ ret = add_to_page_cache_lru(page, iter->mapping, ++ page_offset, GFP_NOFS); ++ if (ret) { ++ __bch2_page_state_release(page); ++ put_page(page); ++ break; ++ } ++ ++ put_page(page); ++ } ++ ++ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); ++ } ++} ++ ++static void bchfs_read(struct btree_trans *trans, ++ struct bch_read_bio *rbio, ++ subvol_inum inum, ++ struct readpages_iter *readpages_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ int flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE; ++ u32 snapshot; ++ int ret = 0; ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ rbio->subvol = inum.subvol; ++ ++ bch2_bkey_buf_init(&sk); ++retry: ++ bch2_trans_begin(trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, ++ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ struct bkey_s_c k; ++ unsigned bytes, sectors, offset_into_extent; ++ enum btree_id data_btree = BTREE_ID_extents; ++ ++ /* ++ * read_extent -> io_time_reset may cause a transaction restart ++ * without returning an error, we need to check for that here: ++ */ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, ++ POS(inum.inum, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ offset_into_extent = iter.pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ ++ ret = bch2_read_indirect_extent(trans, &data_btree, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ k = bkey_i_to_s_c(sk.k); ++ ++ sectors = 
min(sectors, k.k->size - offset_into_extent); ++ ++ if (readpages_iter) ++ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, ++ extent_partial_reads_expensive(k)); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ bch2_bio_page_state_set(&rbio->bio, k); ++ ++ bch2_read_extent(trans, rbio, iter.pos, ++ data_btree, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) ++ break; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ if (ret) { ++ bch_err_inum_ratelimited(c, inum.inum, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ bio_endio(&rbio->bio); ++ } ++ ++ bch2_bkey_buf_exit(&sk, c); ++} ++ ++void bch2_readahead(struct readahead_control *ractl) ++{ ++ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct page *page; ++ struct readpages_iter readpages_iter; ++ int ret; ++ ++ ret = readpages_iter_init(&readpages_iter, ractl); ++ BUG_ON(ret); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ while ((page = readpage_iter_next(&readpages_iter))) { ++ pgoff_t index = readpages_iter.offset + readpages_iter.idx; ++ unsigned n = min_t(unsigned, ++ readpages_iter.nr_pages - ++ readpages_iter.idx, ++ BIO_MAX_VECS); ++ struct bch_read_bio *rbio = ++ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, ++ GFP_NOFS, &c->bio_read), ++ opts); ++ ++ readpages_iter.idx++; ++ ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bchfs_read(&trans, rbio, inode_inum(inode), ++ &readpages_iter); ++ } ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_trans_exit(&trans); ++ kfree(readpages_iter.pages); ++} ++ ++static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, ++ subvol_inum inum, struct page *page) ++{ ++ struct btree_trans trans; ++ ++ bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); ++ rbio->bio.bi_iter.bi_sector = ++ (sector_t) page->index << PAGE_SECTORS_SHIFT; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bchfs_read(&trans, rbio, inum, NULL); ++ bch2_trans_exit(&trans); ++} ++ ++int bch2_readpage(struct file *file, struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct bch_read_bio *rbio; ++ ++ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), opts); ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ ++ __bchfs_readpage(c, rbio, inode_inum(inode), page); ++ return 0; ++} ++ ++static void bch2_read_single_page_end_io(struct bio *bio) ++{ ++ complete(bio->bi_private); ++} ++ ++static int bch2_read_single_page(struct page *page, ++ struct address_space *mapping) ++{ ++ struct bch_inode_info *inode = 
to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_read_bio *rbio; ++ int ret; ++ DECLARE_COMPLETION_ONSTACK(done); ++ ++ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), ++ io_opts(c, &inode->ei_inode)); ++ rbio->bio.bi_private = &done; ++ rbio->bio.bi_end_io = bch2_read_single_page_end_io; ++ ++ __bchfs_readpage(c, rbio, inode_inum(inode), page); ++ wait_for_completion(&done); ++ ++ ret = blk_status_to_errno(rbio->bio.bi_status); ++ bio_put(&rbio->bio); ++ ++ if (ret < 0) ++ return ret; ++ ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* writepages: */ ++ ++struct bch_writepage_state { ++ struct bch_writepage_io *io; ++ struct bch_io_opts opts; ++}; ++ ++static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, ++ struct bch_inode_info *inode) ++{ ++ return (struct bch_writepage_state) { ++ .opts = io_opts(c, &inode->ei_inode) ++ }; ++} ++ ++static void bch2_writepage_io_free(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ ++ bio_put(&io->op.wbio.bio); ++} ++ ++static void bch2_writepage_io_done(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ struct bch_fs *c = io->op.c; ++ struct bio *bio = &io->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bvec; ++ unsigned i; ++ ++ up(&io->op.c->io_in_flight); ++ ++ if (io->op.error) { ++ set_bit(EI_INODE_ERROR, &io->inode->ei_flags); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ SetPageError(bvec->bv_page); ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ /* ++ * racing with fallocate can cause us to add fewer sectors than ++ * expected - but we shouldn't add more sectors than expected: ++ */ ++ WARN_ON_ONCE(io->op.i_sectors_delta > 0); ++ ++ /* ++ * (error (due to going RO) halfway through a page can screw that up ++ * slightly) ++ * XXX wtf? 
++ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); ++ */ ++ ++ /* ++ * PageWriteback is effectively our ref on the inode - fixup i_blocks ++ * before calling end_page_writeback: ++ */ ++ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(bvec->bv_page); ++ } ++ ++ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++} ++ ++static void bch2_writepage_do_io(struct bch_writepage_state *w) ++{ ++ struct bch_writepage_io *io = w->io; ++ ++ down(&io->op.c->io_in_flight); ++ ++ w->io = NULL; ++ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); ++ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++} ++ ++/* ++ * Get a bch_writepage_io and add @page to it - appending to an existing one if ++ * possible, else allocating a new one: ++ */ ++static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct writeback_control *wbc, ++ struct bch_writepage_state *w, ++ struct bch_inode_info *inode, ++ u64 sector, ++ unsigned nr_replicas) ++{ ++ struct bch_write_op *op; ++ ++ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, ++ REQ_OP_WRITE, ++ GFP_NOFS, ++ &c->writepage_bioset), ++ struct bch_writepage_io, op.wbio.bio); ++ ++ closure_init(&w->io->cl, NULL); ++ w->io->inode = inode; ++ ++ op = &w->io->op; ++ bch2_write_op_init(op, c, w->opts); ++ op->target = w->opts.foreground_target; ++ op->nr_replicas = nr_replicas; ++ op->res.nr_replicas = nr_replicas; ++ op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->subvol = inode->ei_subvol; ++ op->pos = POS(inode->v.i_ino, sector); ++ op->wbio.bio.bi_iter.bi_sector = sector; ++ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); ++} ++ ++static int __bch2_writepage(struct page *page, ++ struct writeback_control *wbc, ++ void *data) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_writepage_state *w = data; ++ struct bch_page_state *s, orig; ++ unsigned i, offset, nr_replicas_this_write = U32_MAX; ++ loff_t i_size = i_size_read(&inode->v); ++ pgoff_t end_index = i_size >> PAGE_SHIFT; ++ int ret; ++ ++ EBUG_ON(!PageUptodate(page)); ++ ++ /* Is the page fully inside i_size? */ ++ if (page->index < end_index) ++ goto do_io; ++ ++ /* Is the page fully outside i_size? (truncate in progress) */ ++ offset = i_size & (PAGE_SIZE - 1); ++ if (page->index > end_index || !offset) { ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." 
++ */ ++ zero_user_segment(page, offset, PAGE_SIZE); ++do_io: ++ s = bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ /* ++ * Things get really hairy with errors during writeback: ++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); ++ ++ /* Before unlocking the page, get copy of reservations: */ ++ spin_lock(&s->lock); ++ orig = *s; ++ spin_unlock(&s->lock); ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ s->s[i].nr_replicas = w->opts.compression ++ ? 0 : nr_replicas_this_write; ++ ++ s->s[i].replicas_reserved = 0; ++ s->s[i].state = SECTOR_ALLOCATED; ++ } ++ ++ BUG_ON(atomic_read(&s->write_count)); ++ atomic_set(&s->write_count, 1); ++ ++ BUG_ON(PageWriteback(page)); ++ set_page_writeback(page); ++ ++ unlock_page(page); ++ ++ offset = 0; ++ while (1) { ++ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; ++ u64 sector; ++ ++ while (offset < PAGE_SECTORS && ++ orig.s[offset].state < SECTOR_DIRTY) ++ offset++; ++ ++ if (offset == PAGE_SECTORS) ++ break; ++ ++ while (offset + sectors < PAGE_SECTORS && ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) { ++ reserved_sectors += orig.s[offset + sectors].replicas_reserved; ++ dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; ++ sectors++; ++ } ++ BUG_ON(!sectors); ++ ++ sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; ++ ++ if (w->io && ++ (w->io->op.res.nr_replicas != nr_replicas_this_write || ++ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || ++ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= ++ (BIO_MAX_VECS * PAGE_SIZE) || ++ bio_end_sector(&w->io->op.wbio.bio) != sector)) ++ bch2_writepage_do_io(w); ++ ++ if (!w->io) ++ bch2_writepage_io_alloc(c, wbc, w, inode, sector, ++ nr_replicas_this_write); ++ ++ atomic_inc(&s->write_count); ++ ++ BUG_ON(inode != w->io->inode); ++ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, ++ sectors << 9, offset << 9)); ++ ++ /* Check for writing past i_size: */ ++ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); ++ ++ w->io->op.res.sectors += reserved_sectors; ++ w->io->op.i_sectors_delta -= dirty_sectors; ++ w->io->op.new_i_size = i_size; ++ ++ offset += sectors; ++ } ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(page); ++ ++ return 0; ++} ++ ++int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(mapping->host)); ++ struct blk_plug plug; ++ int ret; ++ ++ blk_start_plug(&plug); ++ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ blk_finish_plug(&plug); ++ return ret; ++} ++ ++/* buffered writes: */ ++ ++int bch2_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res; ++ pgoff_t index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ struct page *page; ++ int ret = -ENOMEM; ++ ++ res = kmalloc(sizeof(*res), GFP_KERNEL); ++ 
if (!res) ++ return -ENOMEM; ++ ++ bch2_page_reservation_init(c, inode, res); ++ *fsdata = res; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ page = grab_cache_page_write_begin(mapping, index, flags); ++ if (!page) ++ goto err_unlock; ++ ++ if (PageUptodate(page)) ++ goto out; ++ ++ /* If we're writing entire page, don't need to read it in first: */ ++ if (len == PAGE_SIZE) ++ goto out; ++ ++ if (!offset && pos + len >= inode->v.i_size) { ++ zero_user_segment(page, len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++ ++ if (index > inode->v.i_size >> PAGE_SHIFT) { ++ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++readpage: ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto err; ++out: ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); ++ if (ret) ++ goto out; ++ } ++ ++ ret = bch2_page_reservation_get(c, inode, page, res, ++ offset, len, true); ++ if (ret) { ++ if (!PageUptodate(page)) { ++ /* ++ * If the page hasn't been read in, we won't know if we ++ * actually need a reservation - we don't actually need ++ * to read here, we just need to check if the page is ++ * fully backed by uncompressed data: ++ */ ++ goto readpage; ++ } ++ ++ goto err; ++ } ++ ++ *pagep = page; ++ return 0; ++err: ++ unlock_page(page); ++ put_page(page); ++ *pagep = NULL; ++err_unlock: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ kfree(res); ++ *fsdata = NULL; ++ return ret; ++} ++ ++int bch2_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res = fsdata; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ if (unlikely(copied < len && !PageUptodate(page))) { ++ /* ++ * The page needs to be read in, but that would destroy ++ * our partial write - simplest thing is to just force ++ * userspace to redo the write: ++ */ ++ zero_user(page, 0, PAGE_SIZE); ++ flush_dcache_page(page); ++ copied = 0; ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (copied) { ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, res, offset, copied); ++ ++ inode->ei_last_dirtied = (unsigned long) current; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_page_reservation_put(c, inode, res); ++ kfree(res); ++ ++ return copied; ++} ++ ++#define WRITE_BATCH_PAGES 32 ++ ++static int __bch2_buffered_write(struct bch_inode_info *inode, ++ struct address_space *mapping, ++ struct iov_iter *iter, ++ loff_t pos, unsigned len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct page *pages[WRITE_BATCH_PAGES]; ++ struct bch2_page_reservation res; ++ unsigned long index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); ++ unsigned i, reserved = 0, set_dirty = 0; ++ unsigned copied = 0, nr_pages_copied = 0; ++ int ret = 0; ++ ++ BUG_ON(!len); ++ BUG_ON(nr_pages > ARRAY_SIZE(pages)); ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] 
= grab_cache_page_write_begin(mapping, index + i, 0); ++ if (!pages[i]) { ++ nr_pages = i; ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ len = min_t(unsigned, len, ++ nr_pages * PAGE_SIZE - offset); ++ break; ++ } ++ } ++ ++ if (offset && !PageUptodate(pages[0])) { ++ ret = bch2_read_single_page(pages[0], mapping); ++ if (ret) ++ goto out; ++ } ++ ++ if ((pos + len) & (PAGE_SIZE - 1) && ++ !PageUptodate(pages[nr_pages - 1])) { ++ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { ++ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); ++ } else { ++ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); ++ if (ret) ++ goto out; ++ } ++ } ++ ++ while (reserved < len) { ++ unsigned i = (offset + reserved) >> PAGE_SHIFT; ++ struct page *page = pages[i]; ++ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - reserved, ++ PAGE_SIZE - pg_offset); ++ ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ ret = bch2_page_state_set(c, inode_inum(inode), ++ pages + i, nr_pages - i); ++ if (ret) ++ goto out; ++ } ++ ++ ret = bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); ++ if (ret) ++ goto out; ++ ++ reserved += pg_len; ++ } ++ ++ if (mapping_writably_mapped(mapping)) ++ for (i = 0; i < nr_pages; i++) ++ flush_dcache_page(pages[i]); ++ ++ while (copied < len) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - copied, ++ PAGE_SIZE - pg_offset); ++ unsigned pg_copied = copy_page_from_iter_atomic(page, ++ pg_offset, pg_len,iter); ++ ++ if (!pg_copied) ++ break; ++ ++ if (!PageUptodate(page) && ++ pg_copied != PAGE_SIZE && ++ pos + copied + pg_copied < inode->v.i_size) { ++ zero_user(page, 0, PAGE_SIZE); ++ break; ++ } ++ ++ flush_dcache_page(page); ++ copied += pg_copied; ++ ++ if (pg_copied != pg_len) ++ break; ++ } ++ ++ if (!copied) ++ goto out; ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ while (set_dirty < copied) { ++ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, copied - set_dirty, ++ PAGE_SIZE - pg_offset); ++ ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); ++ unlock_page(page); ++ put_page(page); ++ ++ set_dirty += pg_len; ++ } ++ ++ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); ++ inode->ei_last_dirtied = (unsigned long) current; ++out: ++ for (i = nr_pages_copied; i < nr_pages; i++) { ++ unlock_page(pages[i]); ++ put_page(pages[i]); ++ } ++ ++ bch2_page_reservation_put(c, inode, &res); ++ ++ return copied ?: ret; ++} ++ ++static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ do { ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE * WRITE_BATCH_PAGES - offset); ++again: ++ /* ++ * Bring in the user page that we will copy from _first_. 
++ * Otherwise there's a nasty deadlock on copying from the ++ * same page as we're writing to, without it being marked ++ * up-to-date. ++ * ++ * Not only is this an optimisation, but it is also required ++ * to check that the address is actually valid, when atomic ++ * usercopies are used, below. ++ */ ++ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ++ bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE - offset); ++ ++ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ++ ret = -EFAULT; ++ break; ++ } ++ } ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ++ if (unlikely(ret < 0)) ++ break; ++ ++ cond_resched(); ++ ++ if (unlikely(ret == 0)) { ++ /* ++ * If we were unable to copy any data at all, we must ++ * fall back to a single segment length write. ++ * ++ * If we didn't fallback here, we could livelock ++ * because not all segments in the iov can be copied at ++ * once without a pagefault. ++ */ ++ bytes = min_t(unsigned long, PAGE_SIZE - offset, ++ iov_iter_single_seg_count(iter)); ++ goto again; ++ } ++ pos += ret; ++ written += ret; ++ ret = 0; ++ ++ balance_dirty_pages_ratelimited(mapping); ++ } while (iov_iter_count(iter)); ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return written ? written : ret; ++} ++ ++/* O_DIRECT reads */ ++ ++static void bio_check_or_release(struct bio *bio, bool check_dirty) ++{ ++ if (check_dirty) { ++ bio_check_pages_dirty(bio); ++ } else { ++ bio_release_pages(bio, false); ++ bio_put(bio); ++ } ++} ++ ++static void bch2_dio_read_complete(struct closure *cl) ++{ ++ struct dio_read *dio = container_of(cl, struct dio_read, cl); ++ ++ dio->req->ki_complete(dio->req, dio->ret); ++ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); ++} ++ ++static void bch2_direct_IO_read_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ ++ if (bio->bi_status) ++ dio->ret = blk_status_to_errno(bio->bi_status); ++ ++ closure_put(&dio->cl); ++} ++ ++static void bch2_direct_IO_read_split_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ bool should_dirty = dio->should_dirty; ++ ++ bch2_direct_IO_read_endio(bio); ++ bio_check_or_release(bio, should_dirty); ++} ++ ++static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_read *dio; ++ struct bio *bio; ++ loff_t offset = req->ki_pos; ++ bool sync = is_sync_kiocb(req); ++ size_t shorten; ++ ssize_t ret; ++ ++ if ((offset|iter->count) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ ret = min_t(loff_t, iter->count, ++ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); ++ ++ if (!ret) ++ return ret; ++ ++ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); ++ iter->count -= shorten; ++ ++ bio = bio_alloc_bioset(NULL, ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), ++ REQ_OP_READ, ++ GFP_KERNEL, ++ &c->dio_read_bioset); ++ ++ bio->bi_end_io = bch2_direct_IO_read_endio; ++ ++ dio = container_of(bio, struct dio_read, rbio.bio); ++ closure_init(&dio->cl, NULL); ++ ++ /* ++ * this is a _really_ horrible hack just to avoid an atomic sub at the ++ * end: ++ */ ++ if (!sync) { ++ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); ++ atomic_set(&dio->cl.remaining, ++ 
CLOSURE_REMAINING_INITIALIZER - ++ CLOSURE_RUNNING + ++ CLOSURE_DESTRUCTOR); ++ } else { ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER + 1); ++ } ++ ++ dio->req = req; ++ dio->ret = ret; ++ /* ++ * This is one of the sketchier things I've encountered: we have to skip ++ * the dirtying of requests that are internal from the kernel (i.e. from ++ * loopback), because we'll deadlock on page_lock. ++ */ ++ dio->should_dirty = iter_is_iovec(iter); ++ ++ goto start; ++ while (iter->count) { ++ bio = bio_alloc_bioset(NULL, ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), ++ REQ_OP_READ, ++ GFP_KERNEL, ++ &c->bio_read); ++ bio->bi_end_io = bch2_direct_IO_read_split_endio; ++start: ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); ++ bio->bi_iter.bi_sector = offset >> 9; ++ bio->bi_private = dio; ++ ++ ret = bio_iov_iter_get_pages(bio, iter); ++ if (ret < 0) { ++ /* XXX: fault inject this path */ ++ bio->bi_status = BLK_STS_RESOURCE; ++ bio_endio(bio); ++ break; ++ } ++ ++ offset += bio->bi_iter.bi_size; ++ ++ if (dio->should_dirty) ++ bio_set_pages_dirty(bio); ++ ++ if (iter->count) ++ closure_get(&dio->cl); ++ ++ bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); ++ } ++ ++ iter->count += shorten; ++ ++ if (sync) { ++ closure_sync(&dio->cl); ++ closure_debug_destroy(&dio->cl); ++ ret = dio->ret; ++ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); ++ return ret; ++ } else { ++ return -EIOCBQUEUED; ++ } ++} ++ ++ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ size_t count = iov_iter_count(iter); ++ ssize_t ret; ++ ++ if (!count) ++ return 0; /* skip atime */ ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ struct blk_plug plug; ++ ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ return ret; ++ ++ file_accessed(file); ++ ++ blk_start_plug(&plug); ++ ret = bch2_direct_IO_read(iocb, iter); ++ blk_finish_plug(&plug); ++ ++ if (ret >= 0) ++ iocb->ki_pos += ret; ++ } else { ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = generic_file_read_iter(iocb, iter); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ } ++ ++ return ret; ++} ++ ++/* O_DIRECT writes */ ++ ++static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, ++ u64 offset, u64 size, ++ unsigned nr_replicas, bool compressed) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 end = offset + size; ++ u32 snapshot; ++ bool ret = true; ++ int err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (err) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) ++ break; ++ ++ if (k.k->p.snapshot != snapshot || ++ nr_replicas > bch2_bkey_replicas(c, k) || ++ (!compressed && bch2_bkey_sectors_compressed(k))) { ++ ret = false; ++ break; ++ } ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(err, BCH_ERR_transaction_restart)) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return err ? 
false : ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *); ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = req->ki_filp->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bio *bio = &dio->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ unsigned unaligned, iter_count; ++ bool sync = dio->sync, dropped_locks; ++ long ret; ++ ++ if (dio->loop) ++ goto loop; ++ ++ down(&c->io_in_flight); ++ ++ while (1) { ++ iter_count = dio->iter.count; ++ ++ if (kthread && dio->mm) ++ kthread_use_mm(dio->mm); ++ BUG_ON(current->faults_disabled_mapping); ++ current->faults_disabled_mapping = mapping; ++ ++ ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ ++ dropped_locks = fdm_dropped_locks(); ++ ++ current->faults_disabled_mapping = NULL; ++ if (kthread && dio->mm) ++ kthread_unuse_mm(dio->mm); ++ ++ /* ++ * If the fault handler returned an error but also signalled ++ * that it dropped & retook ei_pagecache_lock, we just need to ++ * re-shoot down the page cache and retry: ++ */ ++ if (dropped_locks && ret) ++ ret = 0; ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ if (unlikely(dropped_locks)) { ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter_count - 1); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (!bio->bi_iter.bi_size) ++ continue; ++ } ++ ++ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); ++ bio->bi_iter.bi_size -= unaligned; ++ iov_iter_revert(&dio->iter, unaligned); ++ ++ if (!bio->bi_iter.bi_size) { ++ /* ++ * bio_iov_iter_get_pages was only able to get < ++ * blocksize worth of pages: ++ */ ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = dio->op.opts.foreground_target; ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.subvol = inode->ei_subvol; ++ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, inode_inum(inode), ++ dio->op.pos.offset, bio_sectors(bio), ++ dio->op.opts.data_replicas, ++ dio->op.opts.compression != 0)) ++ goto err; ++ ++ task_io_account_write(bio->bi_iter.bi_size); ++ ++ if (!dio->sync && !dio->loop && dio->iter.count) { ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) { ++ dio->sync = sync = true; ++ goto do_io; ++ } ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ } ++do_io: ++ dio->loop = true; ++ closure_call(&dio->op.cl, bch2_write, NULL, NULL); ++ ++ if (sync) ++ wait_for_completion(&dio->done); ++ else ++ return -EIOCBQUEUED; ++loop: ++ i_sectors_acct(c, inode, &dio->quota_res, ++ dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ 
spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ bio->bi_vcnt = 0; ++ ++ if (dio->op.error) { ++ set_bit(EI_INODE_ERROR, &inode->ei_flags); ++ break; ++ } ++ ++ if (!dio->iter.count) ++ break; ++ ++ bio_reset(bio, NULL, REQ_OP_WRITE); ++ reinit_completion(&dio->done); ++ } ++ ++ ret = dio->op.error ?: ((long) dio->written << 9); ++err: ++ up(&c->io_in_flight); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ bio_put(bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (!sync) { ++ req->ki_complete(req, ret); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *op) ++{ ++ struct dio_write *dio = container_of(op, struct dio_write, op); ++ ++ if (dio->sync) ++ complete(&dio->done); ++ else ++ bch2_dio_write_loop(dio); ++} ++ ++static noinline ++ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct dio_write *dio; ++ struct bio *bio; ++ bool locked = true, extending; ++ ssize_t ret; ++ ++ prefetch(&c->opts); ++ prefetch((void *) &c->opts + 64); ++ prefetch(&inode->ei_inode); ++ prefetch((void *) &inode->ei_inode + 64); ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(req, iter); ++ if (unlikely(ret <= 0)) ++ goto err; ++ ++ ret = file_remove_privs(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = file_update_time(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) ++ goto err; ++ ++ inode_dio_begin(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ extending = req->ki_pos + iter->count > inode->v.i_size; ++ if (!extending) { ++ inode_unlock(&inode->v); ++ locked = false; ++ } ++ ++ bio = bio_alloc_bioset(NULL, ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), ++ REQ_OP_WRITE, ++ GFP_KERNEL, ++ &c->dio_write_bioset); ++ dio = container_of(bio, struct dio_write, op.wbio.bio); ++ init_completion(&dio->done); ++ dio->req = req; ++ dio->mm = current->mm; ++ dio->loop = false; ++ dio->sync = is_sync_kiocb(req) || extending; ++ dio->free_iov = false; ++ dio->quota_res.sectors = 0; ++ dio->written = 0; ++ dio->iter = *iter; ++ ++ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, ++ iter->count >> 9, true); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = bch2_dio_write_loop(dio); ++err: ++ if (locked) ++ inode_unlock(&inode->v); ++ return ret; ++err_put_bio: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ bio_put(bio); ++ inode_dio_end(&inode->v); ++ goto err; ++} ++ ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct 
bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) ++ return bch2_direct_write(iocb, from); ++ ++ /* We can write back this queue in page reclaim */ ++ current->backing_dev_info = inode_to_bdi(&inode->v); ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ current->backing_dev_info = NULL; ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++/* fsync: */ ++ ++/* ++ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an ++ * insert trigger: look up the btree inode instead ++ */ ++static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) ++{ ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode); ++ if (ret) ++ return ret; ++ ++ return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); ++} ++ ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2, ret3; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ ret2 = sync_inode_metadata(&inode->v, 1); ++ ret3 = bch2_flush_inode(c, inode_inum(inode)); ++ ++ return ret ?: ret2 ?: ret3; ++} ++ ++/* truncate: */ ++ ++static inline int range_has_data(struct bch_fs *c, u32 subvol, ++ struct bpos start, ++ struct bpos end) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) { ++ ret = 1; ++ break; ++ } ++ } ++ start = iter.pos; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int __bch2_truncate_page(struct bch_inode_info *inode, ++ pgoff_t index, loff_t start, loff_t end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_page_state *s; ++ unsigned start_offset = start & (PAGE_SIZE - 1); ++ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; ++ unsigned i; ++ struct page *page; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ /* Page boundary? Nothing to do */ ++ if (!((index == start >> PAGE_SHIFT && start_offset) || ++ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) ++ return 0; ++ ++ /* Above i_size? 
*/ ++ if (index << PAGE_SHIFT >= inode->v.i_size) ++ return 0; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) { ++ /* ++ * XXX: we're doing two index lookups when we end up reading the ++ * page ++ */ ++ ret = range_has_data(c, inode->ei_subvol, ++ POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); ++ if (ret <= 0) ++ return ret; ++ ++ page = find_or_create_page(mapping, index, GFP_KERNEL); ++ if (unlikely(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ s = bch2_page_state_create(page, 0); ++ if (!s) { ++ ret = -ENOMEM; ++ goto unlock; ++ } ++ ++ if (!PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto unlock; ++ } ++ ++ if (index != start >> PAGE_SHIFT) ++ start_offset = 0; ++ if (index != end >> PAGE_SHIFT) ++ end_offset = PAGE_SIZE; ++ ++ for (i = round_up(start_offset, block_bytes(c)) >> 9; ++ i < round_down(end_offset, block_bytes(c)) >> 9; ++ i++) { ++ s->s[i].nr_replicas = 0; ++ if (s->s[i].state == SECTOR_DIRTY) ++ i_sectors_delta--; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ /* ++ * Caller needs to know whether this page will be written out by ++ * writeback - doing an i_size update if necessary - or whether it will ++ * be responsible for the i_size update: ++ */ ++ ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), ++ PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; ++ ++ zero_user_segment(page, start_offset, end_offset); ++ ++ /* ++ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. ++ * ++ * XXX: because we aren't currently tracking whether the page has actual ++ * data in it (vs. just 0s, or only partially written) this is wrong. ick. ++ */ ++ BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); ++ ++ /* ++ * This removes any writeable userspace mappings; we need to force ++ * .page_mkwrite to be called again before any mmapped writes, to ++ * redirty the full page: ++ */ ++ page_mkclean(page); ++ __set_page_dirty_nobuffers(page); ++unlock: ++ unlock_page(page); ++ put_page(page); ++out: ++ return ret; ++} ++ ++static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) ++{ ++ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, ++ from, round_up(from, PAGE_SIZE)); ++} ++ ++static int bch2_truncate_pages(struct bch_inode_info *inode, ++ loff_t start, loff_t end) ++{ ++ int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, ++ start, end); ++ ++ if (ret >= 0 && ++ start >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ start, end); ++ return ret; ++} ++ ++static int bch2_extend(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode_u, ++ struct iattr *iattr) ++{ ++ struct address_space *mapping = inode->v.i_mapping; ++ int ret; ++ ++ /* ++ * sync appends: ++ * ++ * this has to be done _before_ extending i_size: ++ */ ++ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); ++ if (ret) ++ return ret; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ return bch2_setattr_nonsize(mnt_userns, inode, iattr); ++} ++ ++static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ return 0; ++} ++ ++static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ u64 *new_i_size = p; ++
bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_size = *new_i_size; ++ return 0; ++} ++ ++int bch2_truncate(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_inode_unpacked inode_u; ++ u64 new_i_size = iattr->ia_size; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ /* ++ * If the truncate call will change the size of the file, the ++ * cmtimes should be updated. If the size will not change, we ++ * do not need to update the cmtimes. ++ */ ++ if (iattr->ia_size != inode->v.i_size) { ++ if (!(iattr->ia_valid & ATTR_MTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_mtime); ++ if (!(iattr->ia_valid & ATTR_CTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_ctime); ++ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; ++ } ++ ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); ++ if (ret) ++ goto err; ++ ++ /* ++ * check this before next assertion; on filesystem error our normal ++ * invariants are a bit broken (truncate has to truncate the page cache ++ * before the inode). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && ++ inode->v.i_size < inode_u.bi_size); ++ ++ if (iattr->ia_size > inode->v.i_size) { ++ ret = bch2_extend(mnt_userns, inode, &inode_u, iattr); ++ goto err; ++ } ++ ++ iattr->ia_valid &= ~ATTR_SIZE; ++ ++ ret = bch2_truncate_page(inode, iattr->ia_size); ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ /* ++ * When extending, we're going to write the new i_size to disk ++ * immediately so we need to flush anything above the current on disk ++ * i_size first: ++ * ++ * Also, when extending we need to flush the page that i_size currently ++ * straddles - if it's mapped to userspace, we need to ensure that ++ * userspace has to redirty it and call .mkwrite -> set_page_dirty ++ * again to allocate the part of the page that was extended.
++ */ ++ if (iattr->ia_size > inode_u.bi_size) ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, ++ iattr->ia_size - 1); ++ else if (iattr->ia_size & (PAGE_SIZE - 1)) ++ ret = filemap_write_and_wait_range(mapping, ++ round_down(iattr->ia_size, PAGE_SIZE), ++ iattr->ia_size - 1); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, ++ &new_i_size, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ ret = bch2_fpunch(c, inode_inum(inode), ++ round_up(iattr->ia_size, block_bytes(c)) >> 9, ++ U64_MAX, &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && ++ !bch2_journal_error(&c->journal), c, ++ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", ++ inode->v.i_ino, (u64) inode->v.i_blocks, ++ inode->ei_inode.bi_sectors); ++ if (unlikely(ret)) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ return ret; ++} ++ ++/* fallocate: */ ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ ++static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 end = offset + len; ++ u64 block_start = round_up(offset, block_bytes(c)); ++ u64 block_end = round_down(end, block_bytes(c)); ++ bool truncated_last_page; ++ int ret = 0; ++ ++ ret = bch2_truncate_pages(inode, offset, end); ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ truncated_last_page = ret; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ ++ if (block_start < block_end ) { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode_inum(inode), ++ block_start >> 9, block_end >> 9, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (end >= inode->v.i_size && !truncated_last_page) { ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ } else { ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ } ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ return ret; ++} ++ ++static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ++ loff_t offset, loff_t len, ++ bool insert) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bkey_buf copy; ++ struct btree_trans trans; ++ struct btree_iter src, dst, del; ++ loff_t shift, new_size; ++ u64 src_start; ++ int ret = 0; ++ ++ if ((offset | len) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ if (insert) { ++ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) ++ return -EFBIG; ++ ++ if (offset >= inode->v.i_size) ++ return -EINVAL; ++ ++ src_start = U64_MAX; ++ shift = len; ++ } else { ++ if (offset + len >= inode->v.i_size) ++ return -EINVAL; ++ ++ src_start = offset + len; ++ shift = -len; ++ } ++ ++ new_size = inode->v.i_size + shift; ++ ++ ret = write_invalidate_inode_pages_range(mapping, offset, 
LLONG_MAX); ++ if (ret) ++ return ret; ++ ++ if (insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } else { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode_inum(inode), ++ offset >> 9, (offset + len) >> 9, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (ret) ++ return ret; ++ } ++ ++ bch2_bkey_buf_init(©); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, ++ POS(inode->v.i_ino, src_start >> 9), ++ BTREE_ITER_INTENT); ++ bch2_trans_copy_iter(&dst, &src); ++ bch2_trans_copy_iter(&del, &src); ++ ++ while (ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ struct bpos next_pos; ++ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); ++ struct bpos atomic_end; ++ unsigned trigger_flags = 0; ++ u32 snapshot; ++ ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ++ inode->ei_subvol, &snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&src, snapshot); ++ bch2_btree_iter_set_snapshot(&dst, snapshot); ++ bch2_btree_iter_set_snapshot(&del, snapshot); ++ ++ bch2_trans_begin(&trans); ++ ++ k = insert ++ ? bch2_btree_iter_peek_prev(&src) ++ : bch2_btree_iter_peek(&src); ++ if ((ret = bkey_err(k))) ++ continue; ++ ++ if (!k.k || k.k->p.inode != inode->v.i_ino) ++ break; ++ ++ if (insert && ++ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) ++ break; ++reassemble: ++ bch2_bkey_buf_reassemble(©, c, k); ++ ++ if (insert && ++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) ++ bch2_cut_front(move_pos, copy.k); ++ ++ copy.k->k.p.offset += shift >> 9; ++ bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); ++ ++ ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); ++ if (ret) ++ continue; ++ ++ if (bkey_cmp(atomic_end, copy.k->k.p)) { ++ if (insert) { ++ move_pos = atomic_end; ++ move_pos.offset -= shift >> 9; ++ goto reassemble; ++ } else { ++ bch2_cut_back(atomic_end, copy.k); ++ } ++ } ++ ++ bkey_init(&delete.k); ++ delete.k.p = copy.k->k.p; ++ delete.k.size = copy.k->k.size; ++ delete.k.p.offset -= shift >> 9; ++ bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); ++ ++ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; ++ ++ if (copy.k->k.size != k.k->size) { ++ /* We might end up splitting compressed extents: */ ++ unsigned nr_ptrs = ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ copy.k->k.size, nr_ptrs, ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ } ++ ++ ret = bch2_btree_iter_traverse(&del) ?: ++ bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: ++ bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: ++ bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (!ret) ++ bch2_btree_iter_set_pos(&src, next_pos); ++ } ++ bch2_trans_iter_exit(&trans, &del); ++ bch2_trans_iter_exit(&trans, &dst); ++ bch2_trans_iter_exit(&trans, &src); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(©, c); ++ ++ if (ret) ++ return ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (!insert) { ++ i_size_write(&inode->v, new_size); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ } else { ++ /* We need an inode update to update bi_journal_seq for fsync: */ ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ } ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ u64 start_sector, u64 end_sector) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bpos end_pos = POS(inode->v.i_ino, end_sector); ++ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(inode->v.i_ino, start_sector), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { ++ s64 i_sectors_delta = 0; ++ struct disk_reservation disk_res = { 0 }; ++ struct quota_res quota_res = { 0 }; ++ struct bkey_i_reservation reservation; ++ struct bkey_s_c k; ++ unsigned sectors; ++ u32 snapshot; ++ ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ++ inode->ei_subvol, &snapshot); ++ if (ret) ++ goto bkey_err; ++ ++ bch2_btree_iter_set_snapshot(&iter, snapshot); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ /* already reserved */ ++ if (k.k->type == KEY_TYPE_reservation && ++ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { ++ bch2_btree_iter_advance(&iter); ++ continue; ++ } ++ ++ if (bkey_extent_is_data(k.k) && ++ !(mode & FALLOC_FL_ZERO_RANGE)) { ++ bch2_btree_iter_advance(&iter); ++ continue; ++ } ++ ++ bkey_reservation_init(&reservation.k_i); ++ reservation.k.type = KEY_TYPE_reservation; ++ reservation.k.p = k.k->p; ++ reservation.k.size = k.k->size; ++ ++ bch2_cut_front(iter.pos, &reservation.k_i); ++ bch2_cut_back(end_pos, &reservation.k_i); ++ ++ sectors = reservation.k.size; ++ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); ++ ++ if (!bkey_extent_is_allocation(k.k)) { ++ ret = bch2_quota_reservation_add(c, inode, ++ "a_res, ++ sectors, true); ++ if (unlikely(ret)) ++ goto bkey_err; ++ } ++ ++ if (reservation.v.nr_replicas < replicas || ++ bch2_bkey_sectors_compressed(k)) { ++ ret = bch2_disk_reservation_get(c, &disk_res, sectors, ++ replicas, 0); ++ if (unlikely(ret)) ++ goto bkey_err; ++ ++ reservation.v.nr_replicas = disk_res.nr_replicas; ++ } ++ ++ ret = 
bch2_extent_update(&trans, inode_inum(inode), &iter, ++ &reservation.k_i, ++ &disk_res, NULL, ++ 0, &i_sectors_delta, true); ++ if (ret) ++ goto bkey_err; ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++bkey_err: ++ bch2_quota_reservation_put(c, inode, "a_res); ++ bch2_disk_reservation_put(c, &disk_res); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ } ++ ++ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ ++ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); ++ ++ if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { ++ struct quota_res quota_res = { 0 }; ++ s64 i_sectors_delta = 0; ++ ++ bch2_fpunch_at(&trans, &iter, inode_inum(inode), ++ end_sector, &i_sectors_delta); ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_quota_reservation_put(c, inode, "a_res); ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static long bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 end = offset + len; ++ u64 block_start = round_down(offset, block_bytes(c)); ++ u64 block_end = round_up(end, block_bytes(c)); ++ bool truncated_last_page = false; ++ int ret, ret2 = 0; ++ ++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ++ ret = inode_newsize_ok(&inode->v, end); ++ if (ret) ++ return ret; ++ } ++ ++ if (mode & FALLOC_FL_ZERO_RANGE) { ++ ret = bch2_truncate_pages(inode, offset, end); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ truncated_last_page = ret; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ ++ block_start = round_up(offset, block_bytes(c)); ++ block_end = round_down(end, block_bytes(c)); ++ } ++ ++ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); ++ ++ /* ++ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, ++ * so that the VFS cache i_size is consistent with the btree i_size: ++ */ ++ if (ret && ++ !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) ++ return ret; ++ ++ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) ++ end = inode->v.i_size; ++ ++ if (end >= inode->v.i_size && ++ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || ++ !(mode & FALLOC_FL_KEEP_SIZE))) { ++ spin_lock(&inode->v.i_lock); ++ i_size_write(&inode->v, end); ++ spin_unlock(&inode->v.i_lock); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret2 = bch2_write_inode_size(c, inode, end, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++ ++ return ret ?: ret2; ++} ++ ++long bch2_fallocate_dispatch(struct file *file, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ long ret; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return -EROFS; ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ++ ret = bchfs_fallocate(inode, mode, offset, len); ++ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) ++ ret = bchfs_fpunch(inode, offset, len); ++ else if (mode == FALLOC_FL_INSERT_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, true); ++ else if (mode == FALLOC_FL_COLLAPSE_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, false); ++ else ++ ret = -EOPNOTSUPP; ++ ++ ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ 
percpu_ref_put(&c->writes); ++ ++ return ret; ++} ++ ++loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ++ struct file *file_dst, loff_t pos_dst, ++ loff_t len, unsigned remap_flags) ++{ ++ struct bch_inode_info *src = file_bch_inode(file_src); ++ struct bch_inode_info *dst = file_bch_inode(file_dst); ++ struct bch_fs *c = src->v.i_sb->s_fs_info; ++ s64 i_sectors_delta = 0; ++ u64 aligned_len; ++ loff_t ret = 0; ++ ++ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) ++ return -EINVAL; ++ ++ if (remap_flags & REMAP_FILE_DEDUP) ++ return -EOPNOTSUPP; ++ ++ if ((pos_src & (block_bytes(c) - 1)) || ++ (pos_dst & (block_bytes(c) - 1))) ++ return -EINVAL; ++ ++ if (src == dst && ++ abs(pos_src - pos_dst) < len) ++ return -EINVAL; ++ ++ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ file_update_time(file_dst); ++ ++ inode_dio_wait(&src->v); ++ inode_dio_wait(&dst->v); ++ ++ ret = generic_remap_file_range_prep(file_src, pos_src, ++ file_dst, pos_dst, ++ &len, remap_flags); ++ if (ret < 0 || len == 0) ++ goto err; ++ ++ aligned_len = round_up((u64) len, block_bytes(c)); ++ ++ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ pos_dst, pos_dst + len - 1); ++ if (ret) ++ goto err; ++ ++ mark_pagecache_unallocated(src, pos_src >> 9, ++ (pos_src + aligned_len) >> 9); ++ ++ ret = bch2_remap_range(c, ++ inode_inum(dst), pos_dst >> 9, ++ inode_inum(src), pos_src >> 9, ++ aligned_len >> 9, ++ pos_dst + len, &i_sectors_delta); ++ if (ret < 0) ++ goto err; ++ ++ /* ++ * due to alignment, we might have remapped slightly more than requested ++ */ ++ ret = min((u64) ret << 9, (u64) len); ++ ++ /* XXX get a quota reservation */ ++ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ ++ spin_lock(&dst->v.i_lock); ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); ++ spin_unlock(&dst->v.i_lock); ++ ++ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || ++ IS_SYNC(file_inode(file_dst))) ++ ret = bch2_flush_inode(c, inode_inum(dst)); ++err: ++ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ return ret; ++} ++ ++/* fseek: */ ++ ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ struct page *page; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ loff_t ret; ++ int offset; ++ ++ while (index <= end_index) { ++ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { ++ lock_page(page); ++ ++ offset = page_data_offset(page, ++ page->index == start_index ++ ?
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ unlock_page(page); ++ put_page(page); ++ return ret; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ } else { ++ break; ++ } ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_data(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ subvol_inum inum = inode_inum(inode); ++ u64 isize, next_data = MAX_LFS_FILESIZE; ++ u32 snapshot; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ break; ++ } else if (bkey_extent_is_data(k.k)) { ++ next_data = max(offset, bkey_start_offset(k.k) << 9); ++ break; ++ } else if (k.k->p.offset >> 9 > isize) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); ++ ++ if (next_data >= isize) ++ return -ENXIO; ++ ++ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); ++} ++ ++static int __page_hole_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (!s) ++ return 0; ++ ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state < SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) ++{ ++ pgoff_t index = offset >> PAGE_SHIFT; ++ struct page *page; ++ int pg_offset; ++ loff_t ret = -1; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) ++ return offset; ++ ++ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); ++ if (pg_offset >= 0) ++ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; ++ ++ unlock_page(page); ++ ++ return ret; ++} ++ ++static loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset, hole; ++ ++ while (offset < end_offset) { ++ hole = page_hole_offset(mapping, offset); ++ if (hole >= 0 && hole <= end_offset) ++ return max(start_offset, hole); ++ ++ offset += PAGE_SIZE; ++ offset &= PAGE_MASK; ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_hole(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ subvol_inum inum = inode_inum(inode); ++ u64 isize, next_hole = MAX_LFS_FILESIZE; ++ u32 snapshot; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, 
iter, BTREE_ID_extents, ++ SPOS(inode->v.i_ino, offset >> 9, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ offset, MAX_LFS_FILESIZE); ++ break; ++ } else if (!bkey_extent_is_data(k.k)) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ max(offset, bkey_start_offset(k.k) << 9), ++ k.k->p.offset << 9); ++ ++ if (next_hole < k.k->p.offset << 9) ++ break; ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (next_hole > isize) ++ next_hole = isize; ++ ++ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); ++} ++ ++loff_t bch2_llseek(struct file *file, loff_t offset, int whence) ++{ ++ switch (whence) { ++ case SEEK_SET: ++ case SEEK_CUR: ++ case SEEK_END: ++ return generic_file_llseek(file, offset, whence); ++ case SEEK_DATA: ++ return bch2_seek_data(file, offset); ++ case SEEK_HOLE: ++ return bch2_seek_hole(file, offset); ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_fs_fsio_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fsio_init(struct bch_fs *c) ++{ ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ ret = -ENOMEM; ++ ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +new file mode 100644 +index 000000000000..7f2d7f454be4 +--- /dev/null ++++ b/fs/bcachefs/fs-io.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_H ++#define _BCACHEFS_FS_IO_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++#include "buckets.h" ++#include "io_types.h" ++ ++#include ++ ++struct quota_res; ++ ++int __must_check bch2_write_inode_size(struct bch_fs *, ++ struct bch_inode_info *, ++ loff_t, unsigned); ++ ++int bch2_readpage(struct file *, struct page *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++void bch2_readahead(struct readahead_control *); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++int bch2_fsync(struct file *, loff_t, loff_t, int); ++ ++int bch2_truncate(struct user_namespace *, ++ struct bch_inode_info *, struct iattr *); ++long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); ++ ++loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, ++ loff_t, loff_t, unsigned); ++ ++loff_t bch2_llseek(struct file *, loff_t, int); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidate_folio(struct folio *, size_t, size_t); ++int bch2_releasepage(struct page *, gfp_t); ++int bch2_migrate_page(struct 
address_space *, struct page *, ++ struct page *, enum migrate_mode); ++ ++void bch2_fs_fsio_exit(struct bch_fs *); ++int bch2_fs_fsio_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_H */ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +new file mode 100644 +index 000000000000..9f329a624c12 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.c +@@ -0,0 +1,523 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-ioctl.h" ++#include "quota.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ ++#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ ++#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ ++ ++struct flags_set { ++ unsigned mask; ++ unsigned flags; ++ ++ unsigned projid; ++}; ++ ++static int bch2_inode_flags_set(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ /* ++ * We're relying on btree locking here for exclusion with other ioctl ++ * calls - use the flags in the btree (@bi), not inode->i_flags: ++ */ ++ struct flags_set *s = p; ++ unsigned newflags = s->flags; ++ unsigned oldflags = bi->bi_flags & s->mask; ++ ++ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ !capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ ++ if (!S_ISREG(bi->bi_mode) && ++ !S_ISDIR(bi->bi_mode) && ++ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ return -EINVAL; ++ ++ bi->bi_flags &= ~s->mask; ++ bi->bi_flags |= newflags; ++ ++ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); ++ return 0; ++} ++ ++static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) ++{ ++ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); ++ ++ return put_user(flags, arg); ++} ++ ++static int bch2_ioc_setflags(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ void __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; ++ unsigned uflags; ++ int ret; ++ ++ if (get_user(uflags, (int __user *) arg)) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); ++ if (uflags) ++ return -EOPNOTSUPP; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ++ ret = -EACCES; ++ goto setflags_out; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ++ ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++setflags_out: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct fsxattr fa = { 0 }; ++ ++ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; ++ ++ return copy_to_user(arg, &fa, sizeof(fa)); ++} ++ ++static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct flags_set *s = p; ++ ++ if (s->projid 
!= bi->bi_project) { ++ bi->bi_fields_set |= 1U << Inode_opt_project; ++ bi->bi_project = s->projid; ++ } ++ ++ return bch2_inode_flags_set(inode, bi, p); ++} ++ ++static int bch2_ioc_fssetxattr(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; ++ struct fsxattr fa; ++ int ret; ++ ++ if (copy_from_user(&fa, arg, sizeof(fa))) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); ++ if (fa.fsx_xflags) ++ return -EOPNOTSUPP; ++ ++ if (fa.fsx_projid >= U32_MAX) ++ return -EINVAL; ++ ++ /* ++ * inode fields accessible via the xattr interface are stored with a +1 ++ * bias, so that 0 means unset: ++ */ ++ s.projid = fa.fsx_projid + 1; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ++ ret = -EACCES; ++ goto err; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_set_projid(c, inode, fa.fsx_projid); ++ if (ret) ++ goto err_unlock; ++ ++ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ++ ATTR_CTIME); ++err_unlock: ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_inode_info *dir = p; ++ ++ return !bch2_reinherit_attrs(bi, &dir->ei_inode); ++} ++ ++static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *src, ++ const char __user *name) ++{ ++ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); ++ struct bch_inode_info *dst; ++ struct inode *vinode = NULL; ++ char *kname = NULL; ++ struct qstr qstr; ++ int ret = 0; ++ subvol_inum inum; ++ ++ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); ++ if (!kname) ++ return -ENOMEM; ++ ++ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); ++ if (unlikely(ret < 0)) ++ goto err1; ++ ++ qstr.len = ret; ++ qstr.name = kname; ++ ++ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); ++ if (ret) ++ goto err1; ++ ++ vinode = bch2_vfs_inode_get(c, inum); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) ++ goto err1; ++ ++ dst = to_bch_ei(vinode); ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ goto err2; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ if (inode_attr_changing(src, dst, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst, ++ src->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err3; ++ } ++ ++ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); ++err3: ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ /* return true if we did work */ ++ if (ret >= 0) ++ ret = !ret; ++ ++ mnt_drop_write_file(file); ++err2: ++ iput(vinode); ++err1: ++ kfree(kname); ++ ++ return ret; ++} ++ ++static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) ++{ ++ u32 flags; ++ int ret = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (get_user(flags, arg)) ++ return -EFAULT; ++ ++ bch_notice(c, "shutdown by ioctl type %u", flags); ++ ++ down_write(&c->vfs_sb->s_umount); ++ ++ switch (flags) { ++ case FSOP_GOING_FLAGS_DEFAULT: ++ ret = freeze_bdev(c->vfs_sb->s_bdev); ++ if (ret) ++ goto err; ++ ++ bch2_journal_flush(&c->journal); ++ c->vfs_sb->s_flags |= SB_RDONLY; ++ bch2_fs_emergency_read_only(c); ++ 
thaw_bdev(c->vfs_sb->s_bdev); ++ break; ++ ++ case FSOP_GOING_FLAGS_LOGFLUSH: ++ bch2_journal_flush(&c->journal); ++ fallthrough; ++ ++ case FSOP_GOING_FLAGS_NOLOGFLUSH: ++ c->vfs_sb->s_flags |= SB_RDONLY; ++ bch2_fs_emergency_read_only(c); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++err: ++ up_write(&c->vfs_sb->s_umount); ++ return ret; ++} ++ ++static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, ++ struct bch_ioctl_subvolume arg) ++{ ++ struct inode *dir; ++ struct bch_inode_info *inode; ++ struct user_namespace *s_user_ns; ++ struct dentry *dst_dentry; ++ struct path src_path, dst_path; ++ int how = LOOKUP_FOLLOW; ++ int error; ++ subvol_inum snapshot_src = { 0 }; ++ unsigned lookup_flags = 0; ++ unsigned create_flags = BCH_CREATE_SUBVOL; ++ ++ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| ++ BCH_SUBVOL_SNAPSHOT_RO)) ++ return -EINVAL; ++ ++ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && ++ (arg.src_ptr || ++ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) ++ return -EINVAL; ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) ++ create_flags |= BCH_CREATE_SNAPSHOT; ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) ++ create_flags |= BCH_CREATE_SNAPSHOT_RO; ++ ++ /* why do we need this lock? */ ++ down_read(&c->vfs_sb->s_umount); ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) ++ sync_inodes_sb(c->vfs_sb); ++retry: ++ if (arg.src_ptr) { ++ error = user_path_at(arg.dirfd, ++ (const char __user *)(unsigned long)arg.src_ptr, ++ how, &src_path); ++ if (error) ++ goto err1; ++ ++ if (src_path.dentry->d_sb->s_fs_info != c) { ++ path_put(&src_path); ++ error = -EXDEV; ++ goto err1; ++ } ++ ++ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); ++ } ++ ++ dst_dentry = user_path_create(arg.dirfd, ++ (const char __user *)(unsigned long)arg.dst_ptr, ++ &dst_path, lookup_flags); ++ error = PTR_ERR_OR_ZERO(dst_dentry); ++ if (error) ++ goto err2; ++ ++ if (dst_dentry->d_sb->s_fs_info != c) { ++ error = -EXDEV; ++ goto err3; ++ } ++ ++ if (dst_dentry->d_inode) { ++ error = -EEXIST; ++ goto err3; ++ } ++ ++ dir = dst_path.dentry->d_inode; ++ if (IS_DEADDIR(dir)) { ++ error = -ENOENT; ++ goto err3; ++ } ++ ++ s_user_ns = dir->i_sb->s_user_ns; ++ if (!kuid_has_mapping(s_user_ns, current_fsuid()) || ++ !kgid_has_mapping(s_user_ns, current_fsgid())) { ++ error = -EOVERFLOW; ++ goto err3; ++ } ++ ++ error = inode_permission(file_mnt_user_ns(filp), ++ dir, MAY_WRITE | MAY_EXEC); ++ if (error) ++ goto err3; ++ ++ if (!IS_POSIXACL(dir)) ++ arg.mode &= ~current_umask(); ++ ++ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); ++ if (error) ++ goto err3; ++ ++ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && ++ !arg.src_ptr) ++ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; ++ ++ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), ++ dst_dentry, arg.mode|S_IFDIR, ++ 0, snapshot_src, create_flags); ++ error = PTR_ERR_OR_ZERO(inode); ++ if (error) ++ goto err3; ++ ++ d_instantiate(dst_dentry, &inode->v); ++ fsnotify_mkdir(dir, dst_dentry); ++err3: ++ done_path_create(&dst_path, dst_dentry); ++err2: ++ if (arg.src_ptr) ++ path_put(&src_path); ++ ++ if (retry_estale(error, lookup_flags)) { ++ lookup_flags |= LOOKUP_REVAL; ++ goto retry; ++ } ++err1: ++ up_read(&c->vfs_sb->s_umount); ++ ++ return error; ++} ++ ++static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ++ struct bch_ioctl_subvolume arg) ++{ ++ struct path path; ++ struct inode *dir; ++ int ret = 0; ++ ++ if (arg.flags) ++ return -EINVAL; ++ ++ ret = 
user_path_at(arg.dirfd, ++ (const char __user *)(unsigned long)arg.dst_ptr, ++ LOOKUP_FOLLOW, &path); ++ if (ret) ++ return ret; ++ ++ if (path.dentry->d_sb->s_fs_info != c) { ++ path_put(&path); ++ return -EXDEV; ++ } ++ ++ dir = path.dentry->d_parent->d_inode; ++ ++ ret = __bch2_unlink(dir, path.dentry, true); ++ if (!ret) { ++ fsnotify_rmdir(dir, path.dentry); ++ d_delete(path.dentry); ++ } ++ path_put(&path); ++ ++ return ret; ++} ++ ++long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ return bch2_ioc_getflags(inode, (int __user *) arg); ++ ++ case FS_IOC_SETFLAGS: ++ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); ++ ++ case FS_IOC_FSGETXATTR: ++ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); ++ case FS_IOC_FSSETXATTR: ++ return bch2_ioc_fssetxattr(c, file, inode, ++ (void __user *) arg); ++ ++ case BCHFS_IOC_REINHERIT_ATTRS: ++ return bch2_ioc_reinherit_attrs(c, file, inode, ++ (void __user *) arg); ++ ++ case FS_IOC_GETVERSION: ++ return -ENOTTY; ++ case FS_IOC_SETVERSION: ++ return -ENOTTY; ++ ++ case FS_IOC_GOINGDOWN: ++ return bch2_ioc_goingdown(c, (u32 __user *) arg); ++ ++ case BCH_IOCTL_SUBVOLUME_CREATE: { ++ struct bch_ioctl_subvolume i; ++ ++ if (copy_from_user(&i, (void __user *) arg, sizeof(i))) ++ return -EFAULT; ++ return bch2_ioctl_subvolume_create(c, file, i); ++ } ++ ++ case BCH_IOCTL_SUBVOLUME_DESTROY: { ++ struct bch_ioctl_subvolume i; ++ ++ if (copy_from_user(&i, (void __user *) arg, sizeof(i))) ++ return -EFAULT; ++ return bch2_ioctl_subvolume_destroy(c, file, i); ++ } ++ ++ default: ++ return bch2_fs_ioctl(c, cmd, (void __user *) arg); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ /* These are just misnamed, they actually get/put from/to user an int */ ++ switch (cmd) { ++ case FS_IOC32_GETFLAGS: ++ cmd = FS_IOC_GETFLAGS; ++ break; ++ case FS_IOC32_SETFLAGS: ++ cmd = FS_IOC_SETFLAGS; ++ break; ++ default: ++ return -ENOIOCTLCMD; ++ } ++ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); ++} ++#endif ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h +new file mode 100644 +index 000000000000..f201980ef2c3 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IOCTL_H ++#define _BCACHEFS_FS_IOCTL_H ++ ++/* Inode flags: */ ++ ++/* bcachefs inode flags -> vfs inode flags: */ ++static const unsigned bch_flags_to_vfs[] = { ++ [__BCH_INODE_SYNC] = S_SYNC, ++ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, ++ [__BCH_INODE_APPEND] = S_APPEND, ++ [__BCH_INODE_NOATIME] = S_NOATIME, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ ++static const unsigned bch_flags_to_uflags[] = { ++ [__BCH_INODE_SYNC] = FS_SYNC_FL, ++ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_APPEND] = FS_APPEND_FL, ++ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, ++ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ ++static const unsigned bch_flags_to_xflags[] = { ++ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, ++ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, ++ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; ++}; ++ 
++#define set_flags(_map, _in, _out) \ ++do { \ ++ unsigned _i; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & (1 << _i)) \ ++ (_out) |= _map[_i]; \ ++ else \ ++ (_out) &= ~_map[_i]; \ ++} while (0) ++ ++#define map_flags(_map, _in) \ ++({ \ ++ unsigned _out = 0; \ ++ \ ++ set_flags(_map, _in, _out); \ ++ _out; \ ++}) ++ ++#define map_flags_rev(_map, _in) \ ++({ \ ++ unsigned _i, _out = 0; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & _map[_i]) { \ ++ (_out) |= 1 << _i; \ ++ (_in) &= ~_map[_i]; \ ++ } \ ++ (_out); \ ++}) ++ ++#define map_defined(_map) \ ++({ \ ++ unsigned _in = ~0; \ ++ \ ++ map_flags_rev(_map, _in); \ ++}) ++ ++/* Set VFS inode flags from bcachefs inode: */ ++static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) ++{ ++ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); ++} ++ ++long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); ++long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); ++ ++#endif /* _BCACHEFS_FS_IOCTL_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +new file mode 100644 +index 000000000000..6d57bd87bfd5 +--- /dev/null ++++ b/fs/bcachefs/fs.c +@@ -0,0 +1,1939 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "errcode.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-io.h" ++#include "fs-ioctl.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "quota.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *bch2_inode_cache; ++ ++static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ struct bch_subvolume *); ++ ++static void __pagecache_lock_put(struct pagecache_lock *lock, long i) ++{ ++ BUG_ON(atomic_long_read(&lock->v) == 0); ++ ++ if (atomic_long_sub_return_release(i, &lock->v) == 0) ++ wake_up_all(&lock->wait); ++} ++ ++static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) ++{ ++ long v = atomic_long_read(&lock->v), old; ++ ++ do { ++ old = v; ++ ++ if (i > 0 ? 
v < 0 : v > 0) ++ return false; ++ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, ++ old, old + i)) != old); ++ return true; ++} ++ ++static void __pagecache_lock_get(struct pagecache_lock *lock, long i) ++{ ++ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, 1); ++} ++ ++bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) ++{ ++ return __pagecache_lock_tryget(lock, 1); ++} ++ ++void bch2_pagecache_add_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, 1); ++} ++ ++void bch2_pagecache_block_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, -1); ++} ++ ++void bch2_pagecache_block_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, -1); ++} ++ ++void bch2_inode_update_after_write(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ unsigned fields) ++{ ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(bi->bi_inum != inode->v.i_ino); ++ ++ bch2_assert_pos_locked(trans, BTREE_ID_inodes, ++ POS(0, bi->bi_inum), ++ c->opts.inodes_use_key_cache); ++ ++ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); ++ i_uid_write(&inode->v, bi->bi_uid); ++ i_gid_write(&inode->v, bi->bi_gid); ++ inode->v.i_mode = bi->bi_mode; ++ ++ if (fields & ATTR_ATIME) ++ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); ++ if (fields & ATTR_MTIME) ++ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); ++ if (fields & ATTR_CTIME) ++ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); ++ ++ inode->ei_inode = *bi; ++ ++ bch2_inode_flags_to_vfs(inode); ++} ++ ++int __must_check bch2_write_inode(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ inode_set_fn set, ++ void *p, unsigned fields) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 512); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), ++ BTREE_ITER_INTENT) ?: ++ (set ? set(inode, &inode_u, p) : 0) ?: ++ bch2_inode_write(&trans, &iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); ++ ++ /* ++ * the btree node lock protects inode->ei_inode, not ei_update_lock; ++ * this is important for inode updates via bchfs_write_index_update ++ */ ++ if (!ret) ++ bch2_inode_update_after_write(&trans, inode, &inode_u, fields); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_fs_quota_transfer(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_qid new_qid, ++ unsigned qtypes, ++ enum quota_acct_mode mode) ++{ ++ unsigned i; ++ int ret; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ for (i = 0; i < QTYP_NR; i++) ++ if (new_qid.q[i] == inode->ei_qid.q[i]) ++ qtypes &= ~(1U << i); ++ ++ if (!qtypes) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ++ ret = bch2_quota_transfer(c, qtypes, new_qid, ++ inode->ei_qid, ++ inode->v.i_blocks + ++ inode->ei_quota_reserved, ++ mode); ++ if (!ret) ++ for (i = 0; i < QTYP_NR; i++) ++ if (qtypes & (1 << i)) ++ inode->ei_qid.q[i] = new_qid.q[i]; ++ ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++static int bch2_iget5_test(struct inode *vinode, void *p) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ subvol_inum *inum = p; ++ ++ return inode->ei_subvol == inum->subvol && ++ inode->ei_inode.bi_inum == inum->inum; ++} ++ ++static int bch2_iget5_set(struct inode *vinode, void *p) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ subvol_inum *inum = p; ++ ++ inode->v.i_ino = inum->inum; ++ inode->ei_subvol = inum->subvol; ++ inode->ei_inode.bi_inum = inum->inum; ++ return 0; ++} ++ ++static unsigned bch2_inode_hash(subvol_inum inum) ++{ ++ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ struct bch_inode_info *inode; ++ struct btree_trans trans; ++ struct bch_subvolume subvol; ++ int ret; ++ ++ inode = to_bch_ei(iget5_locked(c->vfs_sb, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ bch2_iget5_set, ++ &inum)); ++ if (unlikely(!inode)) ++ return ERR_PTR(-ENOMEM); ++ if (!(inode->v.i_state & I_NEW)) ++ return &inode->v; ++ ++ bch2_trans_init(&trans, c, 8, 0); ++ ret = lockrestart_do(&trans, ++ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: ++ bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); ++ ++ if (!ret) ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); ++ bch2_trans_exit(&trans); ++ ++ if (ret) { ++ iget_failed(&inode->v); ++ return ERR_PTR(ret); ++ } ++ ++ unlock_new_inode(&inode->v); ++ ++ return &inode->v; ++} ++ ++struct bch_inode_info * ++__bch2_create(struct user_namespace *mnt_userns, ++ struct bch_inode_info *dir, struct dentry *dentry, ++ umode_t mode, dev_t rdev, subvol_inum snapshot_src, ++ unsigned flags) ++{ ++ struct bch_fs *c = dir->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u; ++ struct bch_inode_info *inode, *old; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *default_acl = NULL, *acl = NULL; ++ subvol_inum inum; ++ struct bch_subvolume subvol; ++ u64 journal_seq = 0; ++ int ret; ++ ++ /* ++ * preallocate acls + vfs inode before btree transaction, so that ++ * nothing can fail after the transaction succeeds: ++ */ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); ++ if (ret) ++ return ERR_PTR(ret); ++#endif ++ inode = to_bch_ei(new_inode(c->vfs_sb)); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto err; ++ } ++ ++ bch2_inode_init_early(c, &inode_u); ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) ++ mutex_lock(&dir->ei_update_lock); ++ ++ bch2_trans_init(&trans, c, 8, ++ 2048 + (!(flags & BCH_CREATE_TMPFILE) ++ ? 
dentry->d_name.len : 0)); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_create_trans(&trans, ++ inode_inum(dir), &dir_u, &inode_u, ++ !(flags & BCH_CREATE_TMPFILE) ++ ? &dentry->d_name : NULL, ++ from_kuid(mnt_userns, current_fsuid()), ++ from_kgid(mnt_userns, current_fsgid()), ++ mode, rdev, ++ default_acl, acl, snapshot_src, flags) ?: ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (unlikely(ret)) ++ goto err_before_quota; ++ ++ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; ++ inum.inum = inode_u.bi_inum; ++ ++ ret = bch2_subvolume_get(&trans, inum.subvol, true, ++ BTREE_ITER_WITH_UPDATES, &subvol) ?: ++ bch2_trans_commit(&trans, NULL, &journal_seq, 0); ++ if (unlikely(ret)) { ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++err_before_quota: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ goto err_trans; ++ } ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) { ++ bch2_inode_update_after_write(&trans, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&dir->ei_update_lock); ++ } ++ ++ bch2_iget5_set(&inode->v, &inum); ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); ++ ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); ++ ++ /* ++ * we must insert the new inode into the inode cache before calling ++ * bch2_trans_exit() and dropping locks, else we could race with another ++ * thread pulling the inode in and modifying it: ++ */ ++ ++ inode->v.i_state |= I_CREATING; ++ ++ old = to_bch_ei(inode_insert5(&inode->v, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ bch2_iget5_set, ++ &inum)); ++ BUG_ON(!old); ++ ++ if (unlikely(old != inode)) { ++ /* ++ * We raced, another process pulled the new inode into cache ++ * before us: ++ */ ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ ++ inode = old; ++ } else { ++ /* ++ * we really don't want insert_inode_locked2() to be setting ++ * I_NEW... 
++ */ ++ unlock_new_inode(&inode->v); ++ } ++ ++ bch2_trans_exit(&trans); ++err: ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return inode; ++err_trans: ++ if (!(flags & BCH_CREATE_TMPFILE)) ++ mutex_unlock(&dir->ei_update_lock); ++ ++ bch2_trans_exit(&trans); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ inode = ERR_PTR(ret); ++ goto err; ++} ++ ++/* methods */ ++ ++static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); ++ struct inode *vinode = NULL; ++ subvol_inum inum = { .subvol = 1 }; ++ int ret; ++ ++ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, ++ &dentry->d_name, &inum); ++ ++ if (!ret) ++ vinode = bch2_vfs_inode_get(c, inum); ++ ++ return d_splice_alias(vinode, dentry); ++} ++ ++static int bch2_mknod(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ umode_t mode, dev_t rdev) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, ++ (subvol_inum) { 0 }, 0); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_create(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ umode_t mode, bool excl) ++{ ++ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); ++} ++ ++static int __bch2_link(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_info *dir, ++ struct dentry *dentry) ++{ ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u, inode_u; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_link_trans(&trans, ++ inode_inum(dir), &dir_u, ++ inode_inum(inode), &inode_u, ++ &dentry->d_name)); ++ ++ if (likely(!ret)) { ++ bch2_inode_update_after_write(&trans, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int bch2_link(struct dentry *old_dentry, struct inode *vdir, ++ struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ return ret; ++ ++ ihold(&inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++int __bch2_unlink(struct inode *vdir, struct dentry *dentry, ++ bool deleting_snapshot) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct bch_inode_unpacked dir_u, inode_u; ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_unlink_trans(&trans, ++ inode_inum(dir), &dir_u, ++ &inode_u, &dentry->d_name, ++ deleting_snapshot)); ++ ++ if (likely(!ret)) { ++ bch2_inode_update_after_write(&trans, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ++ ATTR_MTIME); ++ } ++ ++ 
bch2_trans_exit(&trans); ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ ++ return ret; ++} ++ ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ return __bch2_unlink(vdir, dentry, false); ++} ++ ++static int bch2_symlink(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ const char *symname) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; ++ int ret; ++ ++ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, ++ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); ++ if (unlikely(IS_ERR(inode))) ++ return PTR_ERR(inode); ++ ++ inode_lock(&inode->v); ++ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); ++ inode_unlock(&inode->v); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ goto err; ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++err: ++ iput(&inode->v); ++ return ret; ++} ++ ++static int bch2_mkdir(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); ++} ++ ++static int bch2_rename2(struct user_namespace *mnt_userns, ++ struct inode *src_vdir, struct dentry *src_dentry, ++ struct inode *dst_vdir, struct dentry *dst_dentry, ++ unsigned flags) ++{ ++ struct bch_fs *c = src_vdir->i_sb->s_fs_info; ++ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); ++ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); ++ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); ++ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); ++ struct bch_inode_unpacked dst_dir_u, src_dir_u; ++ struct bch_inode_unpacked src_inode_u, dst_inode_u; ++ struct btree_trans trans; ++ enum bch_rename_mode mode = flags & RENAME_EXCHANGE ++ ? BCH_RENAME_EXCHANGE ++ : dst_dentry->d_inode ++ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; ++ int ret; ++ ++ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) ++ return -EINVAL; ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, ++ 0, LLONG_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 8, 2048); ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, src_inode, ++ dst_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst_inode, ++ src_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_rename_trans(&trans, ++ inode_inum(src_dir), &src_dir_u, ++ inode_inum(dst_dir), &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode)); ++ if (unlikely(ret)) ++ goto err; ++ ++ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); ++ BUG_ON(dst_inode && ++ dst_inode->v.i_ino != dst_inode_u.bi_inum); ++ ++ bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ ++ if (src_dir != dst_dir) ++ bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ ++ bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ++ ATTR_CTIME); ++ ++ if (dst_inode) ++ bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ++ ATTR_CTIME); ++err: ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_quota_transfer(c, src_inode, ++ bch_qid(&src_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ if (dst_inode) ++ bch2_fs_quota_transfer(c, dst_inode, ++ bch_qid(&dst_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ return ret; ++} ++ ++static void bch2_setattr_copy(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ unsigned int ia_valid = attr->ia_valid; ++ ++ if (ia_valid & ATTR_UID) ++ bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); ++ if (ia_valid & ATTR_GID) ++ bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); ++ ++ if (ia_valid & ATTR_SIZE) ++ bi->bi_size = attr->ia_size; ++ ++ if (ia_valid & ATTR_ATIME) ++ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); ++ if (ia_valid & ATTR_MTIME) ++ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); ++ if (ia_valid & ATTR_CTIME) ++ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); ++ ++ if (ia_valid & ATTR_MODE) { ++ umode_t mode = attr->ia_mode; ++ kgid_t gid = ia_valid & ATTR_GID ++ ? 
attr->ia_gid ++ : inode->v.i_gid; ++ ++ if (!in_group_p(gid) && ++ !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) ++ mode &= ~S_ISGID; ++ bi->bi_mode = mode; ++ } ++} ++ ++int bch2_setattr_nonsize(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_qid qid; ++ struct btree_trans trans; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl = NULL; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ++ qid = inode->ei_qid; ++ ++ if (attr->ia_valid & ATTR_UID) ++ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); ++ ++ if (attr->ia_valid & ATTR_GID) ++ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ++ ++ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ kfree(acl); ++ acl = NULL; ++ ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto btree_err; ++ ++ bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); ++ ++ if (attr->ia_valid & ATTR_MODE) { ++ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, ++ inode_u.bi_mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++btree_err: ++ bch2_trans_iter_exit(&trans, &inode_iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (unlikely(ret)) ++ goto err_trans; ++ ++ bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); ++ ++ if (acl) ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++err_trans: ++ bch2_trans_exit(&trans); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_getattr(struct user_namespace *mnt_userns, ++ const struct path *path, struct kstat *stat, ++ u32 request_mask, unsigned query_flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ stat->dev = inode->v.i_sb->s_dev; ++ stat->ino = inode->v.i_ino; ++ stat->mode = inode->v.i_mode; ++ stat->nlink = inode->v.i_nlink; ++ stat->uid = inode->v.i_uid; ++ stat->gid = inode->v.i_gid; ++ stat->rdev = inode->v.i_rdev; ++ stat->size = i_size_read(&inode->v); ++ stat->atime = inode->v.i_atime; ++ stat->mtime = inode->v.i_mtime; ++ stat->ctime = inode->v.i_ctime; ++ stat->blksize = block_bytes(c); ++ stat->blocks = inode->v.i_blocks; ++ ++ if (request_mask & STATX_BTIME) { ++ stat->result_mask |= STATX_BTIME; ++ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); ++ } ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ stat->attributes |= STATX_ATTR_IMMUTABLE; ++ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ stat->attributes |= STATX_ATTR_APPEND; ++ stat->attributes_mask |= STATX_ATTR_APPEND; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ stat->attributes |= STATX_ATTR_NODUMP; ++ stat->attributes_mask |= STATX_ATTR_NODUMP; ++ ++ return 0; ++} ++ ++static int bch2_setattr(struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct iattr *iattr) ++{ ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = setattr_prepare(mnt_userns, dentry, 
iattr); ++ if (ret) ++ return ret; ++ ++ return iattr->ia_valid & ATTR_SIZE ++ ? bch2_truncate(mnt_userns, inode, iattr) ++ : bch2_setattr_nonsize(mnt_userns, inode, iattr); ++} ++ ++static int bch2_tmpfile(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, ++ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_mark_tmpfile(dentry, &inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_fill_extent(struct bch_fs *c, ++ struct fiemap_extent_info *info, ++ struct bkey_s_c k, unsigned flags) ++{ ++ if (bkey_extent_is_direct_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ flags |= FIEMAP_EXTENT_SHARED; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int flags2 = 0; ++ u64 offset = p.ptr.offset; ++ ++ if (p.crc.compression_type) ++ flags2 |= FIEMAP_EXTENT_ENCODED; ++ else ++ offset += p.crc.offset; ++ ++ if ((offset & (block_sectors(c) - 1)) || ++ (k.k->size & (block_sectors(c) - 1))) ++ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ++ ++ ret = fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ offset << 9, ++ k.k->size << 9, flags|flags2); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++ } else if (bkey_extent_is_inline_data(k.k)) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DATA_INLINE); ++ } else if (k.k->type == KEY_TYPE_reservation) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DELALLOC| ++ FIEMAP_EXTENT_UNWRITTEN); ++ } else { ++ BUG(); ++ } ++} ++ ++static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ++ u64 start, u64 len) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *ei = to_bch_ei(vinode); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf cur, prev; ++ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); ++ unsigned offset_into_extent, sectors; ++ bool have_extent = false; ++ u32 snapshot; ++ int ret = 0; ++ ++ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); ++ if (ret) ++ return ret; ++ ++ if (start + len < start) ++ return -EINVAL; ++ ++ start >>= 9; ++ ++ bch2_bkey_buf_init(&cur); ++ bch2_bkey_buf_init(&prev); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(ei->v.i_ino, start, snapshot), 0); ++ ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek_upto(&iter, end)).k && ++ !(ret = bkey_err(k))) { ++ enum btree_id data_btree = BTREE_ID_extents; ++ ++ if (!bkey_extent_is_data(k.k) && ++ k.k->type != KEY_TYPE_reservation) { ++ bch2_btree_iter_advance(&iter); ++ continue; ++ } ++ ++ offset_into_extent = iter.pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ ++ ret = bch2_read_indirect_extent(&trans, &data_btree, ++ &offset_into_extent, &cur); ++ if (ret) ++ break; ++ ++ k = bkey_i_to_s_c(cur.k); ++ bch2_bkey_buf_realloc(&prev, c, 
k.k->u64s); ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); ++ bch2_key_resize(&cur.k->k, sectors); ++ cur.k->k.p = iter.pos; ++ cur.k->k.p.offset += cur.k->k.size; ++ ++ if (have_extent) { ++ ret = bch2_fill_extent(c, info, ++ bkey_i_to_s_c(prev.k), 0); ++ if (ret) ++ break; ++ } ++ ++ bkey_copy(prev.k, cur.k); ++ have_extent = true; ++ ++ bch2_btree_iter_set_pos(&iter, ++ POS(iter.pos.inode, iter.pos.offset + sectors)); ++ } ++ start = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ if (!ret && have_extent) ++ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), ++ FIEMAP_EXTENT_LAST); ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); ++ return ret < 0 ? ret : 0; ++} ++ ++static const struct vm_operations_struct bch_vm_ops = { ++ .fault = bch2_page_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = bch2_page_mkwrite, ++}; ++ ++static int bch2_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ ++ vma->vm_ops = &bch_vm_ops; ++ return 0; ++} ++ ++/* Directories: */ ++ ++static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ return generic_file_llseek_size(file, offset, whence, ++ S64_MAX, S64_MAX); ++} ++ ++static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ if (!dir_emit_dots(file, ctx)) ++ return 0; ++ ++ return bch2_readdir(c, inode_inum(inode), ctx); ++} ++ ++static const struct file_operations bch_file_operations = { ++ .llseek = bch2_llseek, ++ .read_iter = bch2_read_iter, ++ .write_iter = bch2_write_iter, ++ .mmap = bch2_mmap, ++ .open = generic_file_open, ++ .fsync = bch2_fsync, ++ .splice_read = generic_file_splice_read, ++ .splice_write = iter_file_splice_write, ++ .fallocate = bch2_fallocate_dispatch, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++ .remap_file_range = bch2_remap_file_range, ++}; ++ ++static const struct inode_operations bch_file_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .fiemap = bch2_fiemap, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_dir_inode_operations = { ++ .lookup = bch2_lookup, ++ .create = bch2_create, ++ .link = bch2_link, ++ .unlink = bch2_unlink, ++ .symlink = bch2_symlink, ++ .mkdir = bch2_mkdir, ++ .rmdir = bch2_unlink, ++ .mknod = bch2_mknod, ++ .rename = bch2_rename2, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .tmpfile = bch2_tmpfile, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct file_operations bch_dir_file_operations = { ++ .llseek = bch2_dir_llseek, ++ .read = generic_read_dir, ++ .iterate_shared = bch2_vfs_readdir, ++ .fsync = bch2_fsync, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++}; ++ ++static const struct inode_operations bch_symlink_inode_operations = { ++ .get_link = page_get_link, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ 
.listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_special_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct address_space_operations bch_address_space_operations = { ++ .readpage = bch2_readpage, ++ .writepages = bch2_writepages, ++ .readahead = bch2_readahead, ++ .dirty_folio = filemap_dirty_folio, ++ .write_begin = bch2_write_begin, ++ .write_end = bch2_write_end, ++ .invalidate_folio = bch2_invalidate_folio, ++ .releasepage = bch2_releasepage, ++ .direct_IO = noop_direct_IO, ++#ifdef CONFIG_MIGRATION ++ .migratepage = bch2_migrate_page, ++#endif ++ .error_remove_page = generic_error_remove_page, ++}; ++ ++struct bcachefs_fid { ++ u64 inum; ++ u32 subvol; ++ u32 gen; ++} __packed; ++ ++struct bcachefs_fid_with_parent { ++ struct bcachefs_fid fid; ++ struct bcachefs_fid dir; ++} __packed; ++ ++static int bcachefs_fid_valid(int fh_len, int fh_type) ++{ ++ switch (fh_type) { ++ case FILEID_BCACHEFS_WITHOUT_PARENT: ++ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); ++ case FILEID_BCACHEFS_WITH_PARENT: ++ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); ++ default: ++ return false; ++ } ++} ++ ++static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) ++{ ++ return (struct bcachefs_fid) { ++ .inum = inode->ei_inode.bi_inum, ++ .subvol = inode->ei_subvol, ++ .gen = inode->ei_inode.bi_generation, ++ }; ++} ++ ++static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, ++ struct inode *vdir) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ ++ if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) ++ return FILEID_INVALID; ++ ++ if (!S_ISDIR(inode->v.i_mode) && dir) { ++ struct bcachefs_fid_with_parent *fid = (void *) fh; ++ ++ fid->fid = bch2_inode_to_fid(inode); ++ fid->dir = bch2_inode_to_fid(dir); ++ ++ *len = sizeof(*fid) / sizeof(u32); ++ return FILEID_BCACHEFS_WITH_PARENT; ++ } else { ++ struct bcachefs_fid *fid = (void *) fh; ++ ++ *fid = bch2_inode_to_fid(inode); ++ ++ *len = sizeof(*fid) / sizeof(u32); ++ return FILEID_BCACHEFS_WITHOUT_PARENT; ++ } ++} ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ struct bcachefs_fid fid) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { ++ .subvol = fid.subvol, ++ .inum = fid.inum, ++ }); ++ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { ++ iput(vinode); ++ vinode = ERR_PTR(-ESTALE); ++ } ++ return vinode; ++} ++ ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, ++ int fh_len, int fh_type) ++{ ++ struct bcachefs_fid *fid = (void *) _fid; ++ ++ if (!bcachefs_fid_valid(fh_len, fh_type)) ++ return NULL; ++ ++ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); ++} ++ ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, ++ int fh_len, int fh_type) ++{ ++ struct bcachefs_fid_with_parent *fid = (void *) _fid; ++ ++ if (!bcachefs_fid_valid(fh_len, fh_type) || ++ fh_type != FILEID_BCACHEFS_WITH_PARENT) ++ return NULL; ++ ++ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); ++} ++ ++static struct dentry *bch2_get_parent(struct dentry *child) ++{ ++ 
struct bch_inode_info *inode = to_bch_ei(child->d_inode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ subvol_inum parent_inum = { ++ .subvol = inode->ei_inode.bi_parent_subvol ?: ++ inode->ei_subvol, ++ .inum = inode->ei_inode.bi_dir, ++ }; ++ ++ if (!parent_inum.inum) ++ return NULL; ++ ++ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); ++} ++ ++static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) ++{ ++ struct bch_inode_info *inode = to_bch_ei(child->d_inode); ++ struct bch_inode_info *dir = to_bch_ei(parent->d_inode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter1; ++ struct btree_iter iter2; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked inode_u; ++ subvol_inum target; ++ u32 snapshot; ++ unsigned name_len; ++ int ret; ++ ++ if (!S_ISDIR(dir->v.i_mode)) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++ bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&iter1, snapshot); ++ bch2_btree_iter_set_snapshot(&iter2, snapshot); ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); ++ if (ret) ++ goto err; ++ ++ if (inode_u.bi_dir == dir->ei_inode.bi_inum) { ++ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); ++ ++ k = bch2_btree_iter_peek_slot(&iter1); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_dirent) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ d = bkey_s_c_to_dirent(k); ++ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); ++ if (ret > 0) ++ ret = -ENOENT; ++ if (ret) ++ goto err; ++ ++ if (target.subvol == inode->ei_subvol && ++ target.inum == inode->ei_inode.bi_inum) ++ goto found; ++ } else { ++ /* ++ * File with multiple hardlinks and our backref is to the wrong ++ * directory - linear search: ++ */ ++ for_each_btree_key_continue_norestart(iter2, 0, k, ret) { ++ if (k.k->p.inode > dir->ei_inode.bi_inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ ++ if (target.subvol == inode->ei_subvol && ++ target.inum == inode->ei_inode.bi_inum) ++ goto found; ++ } ++ } ++ ++ ret = -ENOENT; ++ goto err; ++found: ++ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); ++ ++ memcpy(name, d.v->d_name, name_len); ++ name[name_len] = '\0'; ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter1); ++ bch2_trans_iter_exit(&trans, &iter2); ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static const struct export_operations bch_export_ops = { ++ .encode_fh = bch2_encode_fh, ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ .get_parent = bch2_get_parent, ++ .get_name = bch2_get_name, ++}; ++ ++static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct bch_subvolume *subvol) ++{ ++ bch2_inode_update_after_write(trans, inode, bi, ~0); ++ ++ if (BCH_SUBVOLUME_SNAP(subvol)) 
++ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); ++ else ++ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); ++ ++ inode->v.i_blocks = bi->bi_sectors; ++ inode->v.i_ino = bi->bi_inum; ++ inode->v.i_rdev = bi->bi_dev; ++ inode->v.i_generation = bi->bi_generation; ++ inode->v.i_size = bi->bi_size; ++ ++ inode->ei_flags = 0; ++ inode->ei_quota_reserved = 0; ++ inode->ei_qid = bch_qid(bi); ++ inode->ei_subvol = inum.subvol; ++ ++ inode->v.i_mapping->a_ops = &bch_address_space_operations; ++ ++ switch (inode->v.i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->v.i_op = &bch_file_inode_operations; ++ inode->v.i_fop = &bch_file_operations; ++ break; ++ case S_IFDIR: ++ inode->v.i_op = &bch_dir_inode_operations; ++ inode->v.i_fop = &bch_dir_file_operations; ++ break; ++ case S_IFLNK: ++ inode_nohighmem(&inode->v); ++ inode->v.i_op = &bch_symlink_inode_operations; ++ break; ++ default: ++ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); ++ inode->v.i_op = &bch_special_inode_operations; ++ break; ++ } ++} ++ ++static struct inode *bch2_alloc_inode(struct super_block *sb) ++{ ++ struct bch_inode_info *inode; ++ ++ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); ++ if (!inode) ++ return NULL; ++ ++ inode_init_once(&inode->v); ++ mutex_init(&inode->ei_update_lock); ++ pagecache_lock_init(&inode->ei_pagecache_lock); ++ mutex_init(&inode->ei_quota_lock); ++ ++ return &inode->v; ++} ++ ++static void bch2_i_callback(struct rcu_head *head) ++{ ++ struct inode *vinode = container_of(head, struct inode, i_rcu); ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ kmem_cache_free(bch2_inode_cache, inode); ++} ++ ++static void bch2_destroy_inode(struct inode *vinode) ++{ ++ call_rcu(&vinode->i_rcu, bch2_i_callback); ++} ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); ++ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); ++ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); ++ ++ return 0; ++} ++ ++static int bch2_vfs_write_inode(struct inode *vinode, ++ struct writeback_control *wbc) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static void bch2_evict_inode(struct inode *vinode) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ truncate_inode_pages_final(&inode->v.i_data); ++ ++ clear_inode(&inode->v); ++ ++ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); ++ ++ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), ++ KEY_TYPE_QUOTA_WARN); ++ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++ bch2_inode_rm(c, inode_inum(inode)); ++ } ++} ++ ++void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ snapshot_id_list *s) ++{ ++ struct super_block *sb = c->vfs_sb; ++ struct inode *inode; ++ ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ 
spin_unlock(&sb->s_inode_list_lock); ++again: ++ cond_resched(); ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ if (!(inode->i_state & I_DONTCACHE)) { ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ ++ spin_lock(&inode->i_lock); ++ if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && ++ !(inode->i_state & I_FREEING)) { ++ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); ++ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); ++ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&sb->s_inode_list_lock); ++ schedule(); ++ finish_wait(wq, &wait.wq_entry); ++ goto again; ++ } ++ ++ spin_unlock(&inode->i_lock); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++} ++ ++static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ struct super_block *sb = dentry->d_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); ++ unsigned shift = sb->s_blocksize_bits - 9; ++ /* ++ * this assumes inodes take up 64 bytes, which is a decent average ++ * number: ++ */ ++ u64 avail_inodes = ((usage.capacity - usage.used) << 3); ++ u64 fsid; ++ ++ buf->f_type = BCACHEFS_STATFS_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = usage.capacity >> shift; ++ buf->f_bfree = usage.free >> shift; ++ buf->f_bavail = avail_factor(usage.free) >> shift; ++ ++ buf->f_files = usage.nr_inodes + avail_inodes; ++ buf->f_ffree = avail_inodes; ++ ++ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ ++ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); ++ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; ++ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; ++ buf->f_namelen = BCH_NAME_MAX; ++ ++ return 0; ++} ++ ++static int bch2_sync_fs(struct super_block *sb, int wait) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ if (!wait) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ return 0; ++ } ++ ++ return bch2_journal_flush(&c->journal); ++} ++ ++static struct bch_fs *bch2_path_to_fs(const char *path) ++{ ++ struct bch_fs *c; ++ dev_t dev; ++ int ret; ++ ++ ret = lookup_bdev(path, &dev); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ c = bch2_dev_to_fs(dev); ++ if (c) ++ closure_put(&c->cl); ++ return c ?: ERR_PTR(-ENOENT); ++} ++ ++static char **split_devs(const char *_dev_name, unsigned *nr) ++{ ++ char *dev_name = NULL, **devs = NULL, *s; ++ size_t i, nr_devs = 0; ++ ++ dev_name = kstrdup(_dev_name, GFP_KERNEL); ++ if (!dev_name) ++ return NULL; ++ ++ for (s = dev_name; s; s = strchr(s + 1, ':')) ++ nr_devs++; ++ ++ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); ++ if (!devs) { ++ kfree(dev_name); ++ return NULL; ++ } ++ ++ for (i = 0, s = dev_name; ++ s; ++ (s = strchr(s, ':')) && (*s++ = '\0')) ++ devs[i++] = s; ++ ++ *nr = nr_devs; ++ return devs; ++} ++ ++static int bch2_remount(struct super_block *sb, int *flags, char *data) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_opts opts = bch2_opts_empty(); ++ int ret; ++ ++ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(c, &opts, data); ++ if (ret) ++ return ret; ++ ++ if (opts.read_only != c->opts.read_only) { ++ down_write(&c->state_lock); ++ ++ if (opts.read_only) { ++ bch2_fs_read_only(c); ++ ++ sb->s_flags |= SB_RDONLY; ++ } else { ++ ret = 
bch2_fs_read_write(c); ++ if (ret) { ++ bch_err(c, "error going rw: %i", ret); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ sb->s_flags &= ~SB_RDONLY; ++ } ++ ++ c->opts.read_only = opts.read_only; ++ ++ up_write(&c->state_lock); ++ } ++ ++ if (opts.errors >= 0) ++ c->opts.errors = opts.errors; ++ ++ return ret; ++} ++ ++static int bch2_show_devname(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ struct bch_dev *ca; ++ unsigned i; ++ bool first = true; ++ ++ for_each_online_member(ca, c, i) { ++ if (!first) ++ seq_putc(seq, ':'); ++ first = false; ++ seq_puts(seq, "/dev/"); ++ seq_puts(seq, ca->name); ++ } ++ ++ return 0; ++} ++ ++static int bch2_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ enum bch_opt_id i; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->flags & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ printbuf_reset(&buf); ++ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, ++ OPT_SHOW_MOUNT_STYLE); ++ seq_putc(seq, ','); ++ seq_puts(seq, buf.buf); ++ } ++ ++ if (buf.allocation_failure) ++ ret = -ENOMEM; ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_put_super(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ __bch2_fs_stop(c); ++} ++ ++static const struct super_operations bch_super_operations = { ++ .alloc_inode = bch2_alloc_inode, ++ .destroy_inode = bch2_destroy_inode, ++ .write_inode = bch2_vfs_write_inode, ++ .evict_inode = bch2_evict_inode, ++ .sync_fs = bch2_sync_fs, ++ .statfs = bch2_statfs, ++ .show_devname = bch2_show_devname, ++ .show_options = bch2_show_options, ++ .remount_fs = bch2_remount, ++ .put_super = bch2_put_super, ++#if 0 ++ .freeze_fs = bch2_freeze, ++ .unfreeze_fs = bch2_unfreeze, ++#endif ++}; ++ ++static int bch2_set_super(struct super_block *s, void *data) ++{ ++ s->s_fs_info = data; ++ return 0; ++} ++ ++static int bch2_noset_super(struct super_block *s, void *data) ++{ ++ return -EBUSY; ++} ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ struct bch_fs *c = s->s_fs_info; ++ struct bch_fs **devs = data; ++ unsigned i; ++ ++ if (!c) ++ return false; ++ ++ for (i = 0; devs[i]; i++) ++ if (c != devs[i]) ++ return false; ++ return true; ++} ++ ++static struct dentry *bch2_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ struct super_block *sb; ++ struct inode *vinode; ++ struct bch_opts opts = bch2_opts_empty(); ++ char **devs; ++ struct bch_fs **devs_to_fs = NULL; ++ unsigned i, nr_devs; ++ int ret; ++ ++ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(NULL, &opts, data); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!dev_name || strlen(dev_name) == 0) ++ return ERR_PTR(-EINVAL); ++ ++ devs = split_devs(dev_name, &nr_devs); ++ if (!devs) ++ return ERR_PTR(-ENOMEM); ++ ++ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); ++ if (!devs_to_fs) { ++ sb = ERR_PTR(-ENOMEM); ++ goto got_sb; ++ } ++ ++ for (i = 0; i < nr_devs; i++) ++ devs_to_fs[i] = bch2_path_to_fs(devs[i]); ++ ++ sb = sget(fs_type, bch2_test_super, bch2_noset_super, ++ flags|SB_NOSEC, devs_to_fs); ++ if (!IS_ERR(sb)) ++ goto got_sb; ++ ++ c = bch2_fs_open(devs, 
nr_devs, opts); ++ if (IS_ERR(c)) { ++ sb = ERR_CAST(c); ++ goto got_sb; ++ } ++ ++ /* Some options can't be parsed until after the fs is started: */ ++ ret = bch2_parse_mount_opts(c, &opts, data); ++ if (ret) { ++ bch2_fs_stop(c); ++ sb = ERR_PTR(ret); ++ goto got_sb; ++ } ++ ++ bch2_opts_apply(&c->opts, opts); ++ ++ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); ++ if (IS_ERR(sb)) ++ bch2_fs_stop(c); ++got_sb: ++ kfree(devs_to_fs); ++ kfree(devs[0]); ++ kfree(devs); ++ ++ if (IS_ERR(sb)) ++ return ERR_CAST(sb); ++ ++ c = sb->s_fs_info; ++ ++ if (sb->s_root) { ++ if ((flags ^ sb->s_flags) & SB_RDONLY) { ++ ret = -EBUSY; ++ goto err_put_super; ++ } ++ goto out; ++ } ++ ++ sb->s_blocksize = block_bytes(c); ++ sb->s_blocksize_bits = ilog2(block_bytes(c)); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_op = &bch_super_operations; ++ sb->s_export_op = &bch_export_ops; ++#ifdef CONFIG_BCACHEFS_QUOTA ++ sb->s_qcop = &bch2_quotactl_operations; ++ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; ++#endif ++ sb->s_xattr = bch2_xattr_handlers; ++ sb->s_magic = BCACHEFS_STATFS_MAGIC; ++ sb->s_time_gran = c->sb.nsec_per_time_unit; ++ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; ++ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); ++ c->vfs_sb = sb; ++ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ ++ ret = super_setup_bdi(sb); ++ if (ret) ++ goto err_put_super; ++ ++ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; ++ ++ for_each_online_member(ca, c, i) { ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ ++ c->dev = sb->s_dev; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ if (c->opts.acl) ++ sb->s_flags |= SB_POSIXACL; ++#endif ++ ++ sb->s_shrink.seeks = 0; ++ ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) { ++ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); ++ goto err_put_super; ++ } ++ ++ sb->s_root = d_make_root(vinode); ++ if (!sb->s_root) { ++ bch_err(c, "error mounting: error allocating root dentry"); ++ ret = -ENOMEM; ++ goto err_put_super; ++ } ++ ++ sb->s_flags |= SB_ACTIVE; ++out: ++ return dget(sb->s_root); ++ ++err_put_super: ++ deactivate_locked_super(sb); ++ return ERR_PTR(ret); ++} ++ ++static void bch2_kill_sb(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ generic_shutdown_super(sb); ++ bch2_fs_free(c); ++} ++ ++static struct file_system_type bcache_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "bcachefs", ++ .mount = bch2_mount, ++ .kill_sb = bch2_kill_sb, ++ .fs_flags = FS_REQUIRES_DEV, ++}; ++ ++MODULE_ALIAS_FS("bcachefs"); ++ ++void bch2_vfs_exit(void) ++{ ++ unregister_filesystem(&bcache_fs_type); ++ if (bch2_inode_cache) ++ kmem_cache_destroy(bch2_inode_cache); ++} ++ ++int __init bch2_vfs_init(void) ++{ ++ int ret = -ENOMEM; ++ ++ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ if (!bch2_inode_cache) ++ goto err; ++ ++ ret = register_filesystem(&bcache_fs_type); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_vfs_exit(); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +new file mode 100644 +index 000000000000..9f4b57e30e2a +--- /dev/null ++++ b/fs/bcachefs/fs.h +@@ -0,0 +1,208 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_H ++#define _BCACHEFS_FS_H ++ 
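The bch2_mount()/split_devs() code above takes a colon-separated device list in dev_name (e.g. "/dev/sda:/dev/sdb") and opens all members as a single filesystem. Purely as an illustration of that parsing step, and not part of the patch itself, the following minimal standalone userspace sketch (the file name parse_devs.c and all names are illustrative, plain libc only) shows the same colon-splitting idea:

/* parse_devs.c - illustrative userspace sketch of colon-separated
 * device-list parsing, mirroring the idea of split_devs() above.
 * Build: cc -o parse_devs parse_devs.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *arg = argc > 1 ? argv[1] : "/dev/sda:/dev/sdb:/dev/sdc";
	char *list = strdup(arg);	/* private writable copy, like the kstrdup() in split_devs() */
	char *s, *next;
	unsigned nr = 0;

	if (!list)
		return 1;

	for (s = list; s; s = next) {
		next = strchr(s, ':');	/* each ':' separates two device paths */
		if (next)
			*next++ = '\0';
		printf("device %u: %s\n", nr++, s);
	}

	printf("%u device(s)\n", nr);
	free(list);
	return 0;
}

The kernel code above keeps both the pointer array and the backing kstrdup() buffer so the got_sb path can release them with kfree(devs[0]) and kfree(devs).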
++#include "inode.h" ++#include "opts.h" ++#include "str_hash.h" ++#include "quota_types.h" ++ ++#include ++#include ++ ++/* ++ * Two-state lock - can be taken for add or block - both states are shared, ++ * like read side of rwsem, but conflict with other state: ++ */ ++struct pagecache_lock { ++ atomic_long_t v; ++ wait_queue_head_t wait; ++}; ++ ++static inline void pagecache_lock_init(struct pagecache_lock *lock) ++{ ++ atomic_long_set(&lock->v, 0); ++ init_waitqueue_head(&lock->wait); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *); ++bool bch2_pagecache_add_tryget(struct pagecache_lock *); ++void bch2_pagecache_add_get(struct pagecache_lock *); ++void bch2_pagecache_block_put(struct pagecache_lock *); ++void bch2_pagecache_block_get(struct pagecache_lock *); ++ ++struct bch_inode_info { ++ struct inode v; ++ unsigned long ei_flags; ++ ++ struct mutex ei_update_lock; ++ u64 ei_quota_reserved; ++ unsigned long ei_last_dirtied; ++ ++ struct pagecache_lock ei_pagecache_lock; ++ ++ struct mutex ei_quota_lock; ++ struct bch_qid ei_qid; ++ ++ u32 ei_subvol; ++ ++ /* copy of inode in btree: */ ++ struct bch_inode_unpacked ei_inode; ++}; ++ ++static inline subvol_inum inode_inum(struct bch_inode_info *inode) ++{ ++ return (subvol_inum) { ++ .subvol = inode->ei_subvol, ++ .inum = inode->ei_inode.bi_inum, ++ }; ++} ++ ++/* ++ * Set if we've gotten a btree error for this inode, and thus the vfs inode and ++ * btree inode may be inconsistent: ++ */ ++#define EI_INODE_ERROR 0 ++ ++/* ++ * Set in the inode is in a snapshot subvolume - we don't do quota accounting in ++ * those: ++ */ ++#define EI_INODE_SNAPSHOT 1 ++ ++#define to_bch_ei(_inode) \ ++ container_of_or_null(_inode, struct bch_inode_info, v) ++ ++static inline int ptrcmp(void *l, void *r) ++{ ++ return cmp_int(l, r); ++} ++ ++enum bch_inode_lock_op { ++ INODE_LOCK = (1U << 0), ++ INODE_PAGECACHE_BLOCK = (1U << 1), ++ INODE_UPDATE_LOCK = (1U << 2), ++}; ++ ++#define bch2_lock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ down_write_nested(&a[i]->v.i_rwsem, i); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_lock_nested(&a[i]->ei_update_lock, i);\ ++ } \ ++} while (0) ++ ++#define bch2_unlock_inodes(_locks, ...) 
\ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ up_write(&a[i]->v.i_rwsem); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_unlock(&a[i]->ei_update_lock); \ ++ } \ ++} while (0) ++ ++static inline struct bch_inode_info *file_bch_inode(struct file *file) ++{ ++ return to_bch_ei(file_inode(file)); ++} ++ ++static inline bool inode_attr_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode, ++ enum inode_opt_id id) ++{ ++ return !(inode->ei_inode.bi_fields_set & (1 << id)) && ++ bch2_inode_opt_get(&dir->ei_inode, id) != ++ bch2_inode_opt_get(&inode->ei_inode, id); ++} ++ ++static inline bool inode_attrs_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode) ++{ ++ unsigned id; ++ ++ for (id = 0; id < Inode_opt_nr; id++) ++ if (inode_attr_changing(dir, inode, id)) ++ return true; ++ ++ return false; ++} ++ ++struct bch_inode_unpacked; ++ ++#ifndef NO_BCACHEFS_FS ++ ++struct bch_inode_info * ++__bch2_create(struct user_namespace *, struct bch_inode_info *, ++ struct dentry *, umode_t, dev_t, subvol_inum, unsigned); ++ ++int bch2_fs_quota_transfer(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_qid, ++ unsigned, ++ enum quota_acct_mode); ++ ++static inline int bch2_set_projid(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ u32 projid) ++{ ++ struct bch_qid qid = inode->ei_qid; ++ ++ qid.q[QTYP_PRJ] = projid; ++ ++ return bch2_fs_quota_transfer(c, inode, qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); ++ ++/* returns 0 if we want to do the update, or error is passed up */ ++typedef int (*inode_set_fn)(struct bch_inode_info *, ++ struct bch_inode_unpacked *, void *); ++ ++void bch2_inode_update_after_write(struct btree_trans *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ unsigned); ++int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, ++ inode_set_fn, void *, unsigned); ++ ++int bch2_setattr_nonsize(struct user_namespace *, ++ struct bch_inode_info *, ++ struct iattr *); ++int __bch2_unlink(struct inode *, struct dentry *, bool); ++ ++void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); ++ ++void bch2_vfs_exit(void); ++int bch2_vfs_init(void); ++ ++#else ++ ++static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ snapshot_id_list *s) {} ++static inline void bch2_vfs_exit(void) {} ++static inline int bch2_vfs_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_FS_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +new file mode 100644 +index 000000000000..bb8cab7cb405 +--- /dev/null ++++ b/fs/bcachefs/fsck.c +@@ -0,0 +1,2390 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "darray.h" ++#include "dirent.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "inode.h" ++#include "keylist.h" ++#include "subvolume.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include /* struct qstr */ ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, ++ u32 snapshot) ++{ ++ 
struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 sectors = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ sectors += k.k->size; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret ?: sectors; ++} ++ ++static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 subdirs = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ if (d.v->d_type == DT_DIR) ++ subdirs++; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret ?: subdirs; ++} ++ ++static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, ++ u32 *subvol) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS(0, snapshot), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(trans->c, "snapshot %u not found", snapshot); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++ ++} ++ ++static int __subvol_lookup(struct btree_trans *trans, u32 subvol, ++ u32 *snapshot, u64 *inum) ++{ ++ struct bch_subvolume s; ++ int ret; ++ ++ ret = bch2_subvolume_get(trans, subvol, false, 0, &s); ++ ++ *snapshot = le32_to_cpu(s.snapshot); ++ *inum = le64_to_cpu(s.inode); ++ return ret; ++} ++ ++static int subvol_lookup(struct btree_trans *trans, u32 subvol, ++ u32 *snapshot, u64 *inum) ++{ ++ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); ++} ++ ++static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ POS(0, inode_nr), ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_inode_unpack(k, inode); ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu: %s", ++ inode_nr, bch2_err_str(ret)); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inode_nr, *snapshot), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = bkey_is_inode(k.k) ++ ? 
bch2_inode_unpack(k, inode) ++ : -ENOENT; ++ if (!ret) ++ *snapshot = iter.pos.snapshot; ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu:%u: %s", ++ inode_nr, *snapshot, bch2_err_str(ret)); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) ++{ ++ return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); ++} ++ ++static int __lookup_dirent(struct btree_trans *trans, ++ struct bch_hash_info hash_info, ++ subvol_inum dir, struct qstr *name, ++ u64 *target, unsigned *type) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, ++ &hash_info, dir, name, 0); ++ if (ret) ++ return ret; ++ ++ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); ++ *target = le64_to_cpu(d.v->d_inum); ++ *type = d.v->d_type; ++ bch2_trans_iter_exit(trans, &iter); ++ return 0; ++} ++ ++static int __write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inode->bi_inum, snapshot), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_inode_write(trans, &iter, inode); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ int ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __write_inode(trans, inode, snapshot)); ++ if (ret) ++ bch_err(trans->c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); ++ return ret; ++} ++ ++static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(trans); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch2_fs_inconsistent(c, ++ "inode %llu:%u not found when deleting", ++ inum, snapshot); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(k, &inode_u); ++ ++ /* Subvolume root? 
*/ ++ if (inode_u.bi_subvol) ++ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ return ret; ++} ++ ++static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bch_inode_unpacked dir_inode; ++ struct bch_hash_info dir_hash_info; ++ int ret; ++ ++ ret = lookup_first_inode(trans, pos.inode, &dir_inode); ++ if (ret) ++ goto err; ++ ++ dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ++ ++ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, &iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &iter); ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* Get lost+found, create if it doesn't exist: */ ++static int lookup_lostfound(struct btree_trans *trans, u32 subvol, ++ struct bch_inode_unpacked *lostfound) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked root; ++ struct bch_hash_info root_hash_info; ++ struct qstr lostfound_str = QSTR("lost+found"); ++ subvol_inum root_inum = { .subvol = subvol }; ++ u64 inum = 0; ++ unsigned d_type = 0; ++ u32 snapshot; ++ int ret; ++ ++ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); ++ if (ret) ++ return ret; ++ ++ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); ++ if (ret) ++ return ret; ++ ++ root_hash_info = bch2_hash_info_init(c, &root); ++ ++ ret = __lookup_dirent(trans, root_hash_info, root_inum, ++ &lostfound_str, &inum, &d_type); ++ if (ret == -ENOENT) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ ++ if (d_type != DT_DIR) { ++ bch_err(c, "error looking up lost+found: not a directory"); ++ return ret; ++ } ++ ++ /* ++ * The check_dirents pass has already run, dangling dirents ++ * shouldn't exist here: ++ */ ++ return __lookup_inode(trans, inum, lostfound, &snapshot); ++ ++create_lostfound: ++ bch2_inode_init_early(c, lostfound); ++ ++ ret = bch2_create_trans(trans, root_inum, &root, ++ lostfound, &lostfound_str, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL, ++ (subvol_inum) { }, 0); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int __reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 inode_snapshot) ++{ ++ struct bch_hash_info dir_hash; ++ struct bch_inode_unpacked lostfound; ++ char name_buf[20]; ++ struct qstr name; ++ u64 dir_offset = 0; ++ u32 subvol; ++ int ret; ++ ++ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); ++ if (ret) ++ return ret; ++ ++ ret = lookup_lostfound(trans, subvol, &lostfound); ++ if (ret) ++ return ret; ++ ++ if (S_ISDIR(inode->bi_mode)) { ++ 
lostfound.bi_nlink++; ++ ++ ret = __write_inode(trans, &lostfound, U32_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ dir_hash = bch2_hash_info_init(trans->c, &lostfound); ++ ++ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); ++ name = (struct qstr) QSTR(name_buf); ++ ++ ret = bch2_dirent_create(trans, ++ (subvol_inum) { ++ .subvol = subvol, ++ .inum = lostfound.bi_inum, ++ }, ++ &dir_hash, ++ inode_d_type(inode), ++ &name, inode->bi_inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ return ret; ++ ++ inode->bi_dir = lostfound.bi_inum; ++ inode->bi_dir_offset = dir_offset; ++ ++ return __write_inode(trans, inode, inode_snapshot); ++} ++ ++static int reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 inode_snapshot) ++{ ++ int ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ __reattach_inode(trans, inode, inode_snapshot)); ++ if (ret) { ++ bch_err(trans->c, "error reattaching inode %llu: %s", ++ inode->bi_inum, bch2_err_str(ret)); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++static int remove_backpointer(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, ++ POS(inode->bi_dir, inode->bi_dir_offset), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ if (k.k->type != KEY_TYPE_dirent) { ++ ret = -ENOENT; ++ goto out; ++ } ++ ++ ret = __remove_dirent(trans, k.k->p); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++struct snapshots_seen_entry { ++ u32 id; ++ u32 equiv; ++}; ++ ++struct snapshots_seen { ++ struct bpos pos; ++ DARRAY(struct snapshots_seen_entry) ids; ++}; ++ ++static inline void snapshots_seen_exit(struct snapshots_seen *s) ++{ ++ darray_exit(&s->ids); ++} ++ ++static inline void snapshots_seen_init(struct snapshots_seen *s) ++{ ++ memset(s, 0, sizeof(*s)); ++} ++ ++static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ struct snapshots_seen_entry *i, n = { id, id }; ++ int ret; ++ ++ darray_for_each(s->ids, i) { ++ if (n.equiv < i->equiv) ++ break; ++ ++ if (i->equiv == n.equiv) { ++ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); ++ return -EINVAL; ++ } ++ } ++ ++ ret = darray_insert_item(&s->ids, i - s->ids.data, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ ++static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct snapshots_seen_entry *i, n = { ++ .id = pos.snapshot, ++ .equiv = bch2_snapshot_equiv(c, pos.snapshot), ++ }; ++ int ret; ++ ++ if (bkey_cmp(s->pos, pos)) ++ s->ids.nr = 0; ++ ++ pos.snapshot = n.equiv; ++ s->pos = pos; ++ ++ darray_for_each(s->ids, i) ++ if (i->equiv == n.equiv) { ++ if (i->id != n.id) { ++ bch_err(c, "snapshot deletion did not run correctly:\n" ++ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", ++ bch2_btree_ids[btree_id], ++ pos.inode, pos.offset, ++ i->id, n.id, n.equiv); ++ return -BCH_ERR_need_snapshot_cleanup; ++ } ++ ++ return 0; ++ } ++ ++ ret = darray_push(&s->ids, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ ++/** ++ * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, ++ * and @ancestor hasn't been overwritten in 
@seen ++ * ++ * That is, returns whether key in @ancestor snapshot is visible in @id snapshot ++ */ ++static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, ++ u32 id, u32 ancestor) ++{ ++ ssize_t i; ++ u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0; ++ ++ BUG_ON(id > ancestor); ++ BUG_ON(!bch2_snapshot_is_equiv(c, id)); ++ BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); ++ ++ /* @ancestor should be the snapshot most recently added to @seen */ ++ BUG_ON(ancestor != seen->pos.snapshot); ++ BUG_ON(ancestor != top); ++ ++ if (id == ancestor) ++ return true; ++ ++ if (!bch2_snapshot_is_ancestor(c, id, ancestor)) ++ return false; ++ ++ for (i = seen->ids.nr - 2; ++ i >= 0 && seen->ids.data[i].equiv >= id; ++ --i) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) ++ return false; ++ ++ return true; ++} ++ ++/** ++ * ref_visible - given a key with snapshot id @src that points to a key with ++ * snapshot id @dst, test whether there is some snapshot in which @dst is ++ * visible. ++ * ++ * This assumes we're visiting @src keys in natural key order. ++ * ++ * @s - list of snapshot IDs already seen at @src ++ * @src - snapshot ID of src key ++ * @dst - snapshot ID of dst key ++ */ ++static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, ++ u32 src, u32 dst) ++{ ++ return dst <= src ++ ? key_visible_in_snapshot(c, s, dst, src) ++ : bch2_snapshot_is_ancestor(c, src, dst); ++} ++ ++#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ ++ (_i)->snapshot <= (_snapshot); _i++) \ ++ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) ++ ++struct inode_walker_entry { ++ struct bch_inode_unpacked inode; ++ u32 snapshot; ++ u64 count; ++}; ++ ++struct inode_walker { ++ bool first_this_inode; ++ u64 cur_inum; ++ ++ DARRAY(struct inode_walker_entry) inodes; ++}; ++ ++static void inode_walker_exit(struct inode_walker *w) ++{ ++ darray_exit(&w->inodes); ++} ++ ++static struct inode_walker inode_walker_init(void) ++{ ++ return (struct inode_walker) { 0, }; ++} ++ ++static int add_inode(struct bch_fs *c, struct inode_walker *w, ++ struct bkey_s_c inode) ++{ ++ struct bch_inode_unpacked u; ++ ++ BUG_ON(bch2_inode_unpack(inode, &u)); ++ ++ return darray_push(&w->inodes, ((struct inode_walker_entry) { ++ .inode = u, ++ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), ++ })); ++} ++ ++static int __walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 restart_count = trans->restart_count; ++ unsigned i; ++ int ret; ++ ++ pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); ++ ++ if (pos.inode == w->cur_inum) { ++ w->first_this_inode = false; ++ goto lookup_snapshot; ++ } ++ ++ w->inodes.nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (k.k->p.offset != pos.inode) ++ break; ++ ++ if (bkey_is_inode(k.k)) ++ add_inode(c, w, k); ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (ret) ++ return ret; ++ ++ w->cur_inum = pos.inode; ++ w->first_this_inode = true; ++ ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ ++lookup_snapshot: ++ for (i = 0; i < w->inodes.nr; i++) ++ if (bch2_snapshot_is_ancestor(c, pos.snapshot, 
w->inodes.data[i].snapshot)) ++ goto found; ++ return INT_MAX; ++found: ++ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); ++ ++ if (pos.snapshot != w->inodes.data[i].snapshot) { ++ struct inode_walker_entry e = w->inodes.data[i]; ++ ++ e.snapshot = pos.snapshot; ++ e.count = 0; ++ ++ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", ++ pos.inode, pos.snapshot, w->inodes.data[i].snapshot); ++ ++ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) ++ --i; ++ ++ ret = darray_insert_item(&w->inodes, i, e); ++ if (ret) ++ return ret; ++ } ++ ++ return i; ++} ++ ++static int __get_visible_inodes(struct btree_trans *trans, ++ struct inode_walker *w, ++ struct snapshots_seen *s, ++ u64 inum) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ w->inodes.nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ if (k.k->p.offset != inum) ++ break; ++ ++ if (!ref_visible(c, s, s->pos.snapshot, equiv)) ++ continue; ++ ++ if (bkey_is_inode(k.k)) ++ add_inode(c, w, k); ++ ++ if (equiv >= s->pos.snapshot) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int check_key_has_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, ++ "key in missing snapshot: %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int hash_redo_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ bch_err(trans->c, "hash_redo_key() not implemented yet"); ++ return -EINVAL; ++#if 0 ++ struct bkey_i *delete; ++ struct bkey_i *tmp; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ ++ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); ++ ++ bkey_reassemble(tmp, k); ++ ++ bkey_init(&delete->k); ++ delete->k.p = k_iter->pos; ++ return bch2_btree_iter_traverse(k_iter) ?: ++ bch2_trans_update(trans, k_iter, delete, 0) ?: ++ bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); ++#endif ++} ++ ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c hash_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct printbuf buf = PRINTBUF; ++ struct bkey_s_c k; ++ u64 hash; ++ int ret = 0; ++ ++ if (hash_k.k->type != desc.key_type) ++ return 0; ++ ++ hash = desc.hash_bkey(hash_info, hash_k); ++ ++ if (likely(hash == hash_k.k->p.offset)) ++ return 0; ++ ++ if (hash_k.k->p.offset < hash) ++ goto bad_hash; ++ ++ for_each_btree_key_norestart(trans, iter, desc.btree_id, ++ POS(hash_k.k->p.inode, hash), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (!bkey_cmp(k.k->p, hash_k.k->p)) ++ break; ++ ++ if (fsck_err_on(k.k->type == desc.key_type && ++ !desc.cmp_bkey(k, hash_k), c, ++ "duplicate hash table keys:\n%s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), ++ buf.buf))) { ++ ret = 
bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; ++ break; ++ } ++ ++ if (bkey_deleted(k.k)) { ++ bch2_trans_iter_exit(trans, &iter); ++ goto bad_hash; ++ } ++ } ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++bad_hash: ++ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " ++ "hashed to %llu\n%s", ++ bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ++ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ret = -BCH_ERR_transaction_restart_nested; ++ } ++fsck_err: ++ goto out; ++} ++ ++static int check_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_inode_unpacked *prev, ++ struct snapshots_seen *s, ++ bool full) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ return 0; ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ ++ /* ++ * if snapshot id isn't a leaf node, skip it - deletion in ++ * particular is not atomic, so on the internal snapshot nodes ++ * we can see inodes marked for deletion after a clean shutdown ++ */ ++ if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) ++ return 0; ++ ++ if (!bkey_is_inode(k.k)) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ if (!full && ++ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED))) ++ return 0; ++ ++ if (prev->bi_inum != u.bi_inum) ++ *prev = u; ++ ++ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || ++ inode_d_type(prev) != inode_d_type(&u), c, ++ "inodes in different snapshots don't match")) { ++ bch_err(c, "repair not implemented yet"); ++ return -EINVAL; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch2_trans_unlock(trans); ++ bch2_fs_lazy_rw(c); ++ ++ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); ++ if (ret) ++ bch_err(c, "error in fsck: error while deleting inode: %s", ++ bch2_err_str(ret)); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ bch2_trans_unlock(trans); ++ bch2_fs_lazy_rw(c); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, ++ iter->pos.snapshot), ++ POS(u.bi_inum, U64_MAX), ++ 0, NULL); ++ if (ret) { ++ bch_err(c, "error in fsck: error truncating inode: %s", ++ bch2_err_str(ret)); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ 
++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error recounting inode sectors: %s", ++ bch2_err_str(sectors)); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { ++ u.bi_dir = 0; ++ u.bi_dir_offset = 0; ++ u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ ret = __write_inode(trans, &u, iter->pos.snapshot); ++ if (ret) ++ bch_err(c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); ++ } ++err: ++fsck_err: ++ if (ret) ++ bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++noinline_for_stack ++static int check_inodes(struct bch_fs *c, bool full) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bch_inode_unpacked prev = { 0 }; ++ struct snapshots_seen s; ++ struct bkey_s_c k; ++ int ret; ++ ++ snapshots_seen_init(&s); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_inode(&trans, &iter, k, &prev, &s, full)); ++ ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ if (ret) ++ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Checking for overlapping extents needs to be reimplemented ++ */ ++#if 0 ++static int fix_overlapping_extent(struct btree_trans *trans, ++ struct bkey_s_c k, struct bpos cut_at) ++{ ++ struct btree_iter iter; ++ struct bkey_i *u; ++ int ret; ++ ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); ++ ++ ++ /* ++ * We don't want to go through the extent_handle_overwrites path: ++ * ++ * XXX: this is going to screw up disk accounting, extent triggers ++ * assume things about extent overwrites - we should be running the ++ * triggers manually here ++ */ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); ++ ++ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++#endif ++ ++static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (!ret && k.k->type != KEY_TYPE_dirent) ++ ret = -ENOENT; ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; ++ } ++ ++ return bkey_s_c_to_dirent(k); ++} ++ ++static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, ++ struct bkey_s_c_dirent d) ++{ ++ return inode->bi_dir == d.k->p.inode && ++ inode->bi_dir_offset == d.k->p.offset; ++} ++ ++static bool dirent_points_to_inode(struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *inode) ++{ ++ return d.v->d_type == DT_SUBVOL ++ ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol ++ : le64_to_cpu(d.v->d_inum) == inode->bi_inum; ++} ++ ++static int inode_backpointer_exists(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ d = dirent_get_by_pos(trans, &iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); ++ ret = bkey_err(d.s_c); ++ if (ret) ++ return ret == -ENOENT ? 0 : ret; ++ ++ ret = dirent_points_to_inode(d, inode); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ s64 count2; ++ ++ darray_for_each(w->inodes, i) { ++ if (i->inode.bi_sectors == i->count) ++ continue; ++ ++ count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); ++ ++ if (i->count != count2) { ++ bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", ++ i->count, count2); ++ i->count = count2; ++ if (i->inode.bi_sectors == i->count) ++ continue; ++ } ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, ++ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", ++ w->cur_inum, i->snapshot, ++ i->inode.bi_sectors, i->count)) { ++ i->inode.bi_sectors = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ if (ret) { ++ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; ++} ++ ++static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct inode_walker *inode, ++ struct snapshots_seen *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ struct printbuf buf = PRINTBUF; ++ struct bpos equiv; ++ int ret = 0; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } ++ ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ ++ if (k.k->type == KEY_TYPE_whiteout) ++ goto out; ++ ++ if (inode->cur_inum != k.k->p.inode) { ++ ret = check_i_sectors(trans, inode); ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(!iter->path->should_be_locked); ++#if 0 ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ++ ?: -BCH_ERR_transaction_restart_nested; ++ goto out; ++ } ++ } ++#endif ++ ret = __walk_inode(trans, inode, equiv); ++ if (ret < 0) ++ goto err; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "extent in missing inode:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } ++ ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } ++ ++ i = inode->inodes.data + ret; ++ ret = 0; ++ ++ if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && ++ !S_ISLNK(i->inode.bi_mode), c, ++ "extent in non regular inode mode %o:\n %s", ++ i->inode.bi_mode, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } ++ ++ /* ++ * Check inodes in reverse order, from oldest snapshots to newest, so ++ * that we emit the fewest number of whiteouts necessary: ++ */ ++ for (i = inode->inodes.data + inode->inodes.nr - 1; ++ i >= inode->inodes.data; ++ --i) { ++ if (i->snapshot > equiv.snapshot || ++ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) ++ continue; ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type past end of inode %llu:%u, i_size %llu\n %s", ++ i->inode.bi_inum, i->snapshot, i->inode.bi_size, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ struct btree_iter iter2; ++ ++ bch2_trans_copy_iter(&iter2, iter); ++ bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_btree_delete_at(trans, &iter2, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &iter2); ++ if (ret) ++ goto err; ++ ++ if (i->snapshot != equiv.snapshot) { ++ ret = snapshots_seen_add(c, s, i->snapshot); ++ if (ret) ++ goto err; ++ } ++ } ++ } ++ ++ if (bkey_extent_is_allocation(k.k)) ++ for_each_visible_inode(c, s, inode, equiv.snapshot, i) ++ i->count += k.k->size; ++#if 0 ++ bch2_bkey_buf_reassemble(&prev, c, k); ++#endif ++ ++out: ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Walk extents: verify that extents have a corresponding S_ISREG inode, and ++ * that i_size an i_sectors are consistent ++ */ ++noinline_for_stack ++static int check_extents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct snapshots_seen s; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++#if 0 ++ struct bkey_buf 
prev; ++ bch2_bkey_buf_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++#endif ++ snapshots_seen_init(&s); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking extents"); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_extent(&trans, &iter, k, &w, &s)); ++#if 0 ++ bch2_bkey_buf_exit(&prev, c); ++#endif ++ inode_walker_exit(&w); ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ ++ if (ret) ++ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ s64 count2; ++ ++ darray_for_each(w->inodes, i) { ++ if (i->inode.bi_nlink == i->count) ++ continue; ++ ++ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); ++ if (count2 < 0) ++ return count2; ++ ++ if (i->count != count2) { ++ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", ++ i->count, count2); ++ i->count = count2; ++ if (i->inode.bi_nlink == i->count) ++ continue; ++ } ++ ++ if (fsck_err_on(i->inode.bi_nlink != i->count, c, ++ "directory %llu:%u with wrong i_nlink: got %u, should be %llu", ++ w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { ++ i->inode.bi_nlink = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ if (ret) { ++ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; ++} ++ ++static int check_dirent_target(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *target, ++ u32 target_snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *n; ++ bool backpointer_exists = true; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (!target->bi_dir && ++ !target->bi_dir_offset) { ++ target->bi_dir = d.k->p.inode; ++ target->bi_dir_offset = d.k->p.offset; ++ ++ ret = __write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ if (!inode_points_to_dirent(target, d)) { ++ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); ++ if (ret < 0) ++ goto err; ++ ++ backpointer_exists = ret; ++ ret = 0; ++ ++ if (fsck_err_on(S_ISDIR(target->bi_mode) && ++ backpointer_exists, c, ++ "directory %llu with multiple links", ++ target->bi_inum)) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto out; ++ } ++ ++ if (fsck_err_on(backpointer_exists && ++ !target->bi_nlink, c, ++ "inode %llu type %s has multiple links but i_nlink 0", ++ target->bi_inum, bch2_d_types[d.v->d_type])) { ++ target->bi_nlink++; ++ target->bi_flags &= ~BCH_INODE_UNLINKED; ++ ++ ret = __write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ if (fsck_err_on(!backpointer_exists, c, ++ "inode %llu:%u has wrong backpointer:\n" ++ "got %llu:%llu\n" ++ "should be %llu:%llu", ++ target->bi_inum, target_snapshot, ++ target->bi_dir, ++ target->bi_dir_offset, ++ d.k->p.inode, ++ d.k->p.offset)) { ++ target->bi_dir = d.k->p.inode; ++ target->bi_dir_offset = d.k->p.offset; ++ ++ ret = __write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ } 
++ ++ if (fsck_err_on(d.v->d_type != inode_d_type(target), c, ++ "incorrect d_type: got %s, should be %s:\n%s", ++ bch2_d_type_str(d.v->d_type), ++ bch2_d_type_str(inode_d_type(target)), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { ++ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = inode_d_type(target); ++ ++ ret = bch2_trans_update(trans, iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ d = dirent_i_to_s_c(n); ++ } ++ ++ if (d.v->d_type == DT_SUBVOL && ++ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && ++ (c->sb.version < bcachefs_metadata_version_subvol_dirent || ++ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", ++ le32_to_cpu(d.v->d_parent_subvol), ++ target->bi_parent_subvol))) { ++ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); ++ ++ ret = bch2_trans_update(trans, iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ d = dirent_i_to_s_c(n); ++ } ++out: ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_hash_info *hash_info, ++ struct inode_walker *dir, ++ struct inode_walker *target, ++ struct snapshots_seen *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_dirent d; ++ struct inode_walker_entry *i; ++ struct printbuf buf = PRINTBUF; ++ struct bpos equiv; ++ int ret = 0; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } ++ ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ ++ if (k.k->type == KEY_TYPE_whiteout) ++ goto out; ++ ++ if (dir->cur_inum != k.k->p.inode) { ++ ret = check_subdir_count(trans, dir); ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(!iter->path->should_be_locked); ++ ++ ret = __walk_inode(trans, dir, equiv); ++ if (ret < 0) ++ goto err; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "dirent in nonexisting directory:\n%s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } ++ ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } ++ ++ i = dir->inodes.data + ret; ++ ret = 0; ++ ++ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, ++ "dirent in non directory inode type %s:\n%s", ++ bch2_d_type_str(inode_d_type(&i->inode)), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto out; ++ } ++ ++ if (dir->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); ++ ++ ret = hash_check_key(trans, bch2_dirent_hash_desc, ++ hash_info, iter, k); ++ if (ret < 0) ++ goto err; ++ if (ret) { ++ /* dirent has been deleted */ ++ ret = 0; ++ goto out; ++ } ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ goto out; ++ ++ d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type == DT_SUBVOL) { ++ struct bch_inode_unpacked subvol_root; ++ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); ++ u32 target_snapshot; ++ u64 target_inum; ++ ++ ret = __subvol_lookup(trans, target_subvol, ++ &target_snapshot, &target_inum); ++ if (ret && ret != -ENOENT) ++ goto err; ++ ++ if (fsck_err_on(ret, c, ++ "dirent points to missing subvolume %llu", ++ le64_to_cpu(d.v->d_child_subvol))) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto err; ++ } ++ ++ ret = __lookup_inode(trans, target_inum, ++ &subvol_root, &target_snapshot); ++ if (ret && ret != -ENOENT) ++ goto err; ++ ++ if (fsck_err_on(ret, c, ++ "subvolume %u points to missing subvolume root %llu", ++ target_subvol, ++ target_inum)) { ++ bch_err(c, "repair not implemented yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, ++ "subvol root %llu has wrong bi_subvol field: got %u, should be %u", ++ target_inum, ++ subvol_root.bi_subvol, target_subvol)) { ++ subvol_root.bi_subvol = target_subvol; ++ ret = __write_inode(trans, &subvol_root, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ ret = check_dirent_target(trans, iter, d, &subvol_root, ++ target_snapshot); ++ if (ret) ++ goto err; ++ } else { ++ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!target->inodes.nr, c, ++ "dirent points to missing inode: (equiv %u)\n%s", ++ equiv.snapshot, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) { ++ ret = __remove_dirent(trans, d.k->p); ++ if (ret) ++ goto err; ++ } ++ ++ darray_for_each(target->inodes, i) { ++ ret = check_dirent_target(trans, iter, d, ++ &i->inode, i->snapshot); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (d.v->d_type == DT_DIR) ++ for_each_visible_inode(c, s, dir, equiv.snapshot, i) ++ i->count++; ++ ++out: ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error 
from check_dirent(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker dir = inode_walker_init(); ++ struct inode_walker target = inode_walker_init(); ++ struct snapshots_seen s; ++ struct bch_hash_info hash_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking dirents"); ++ ++ snapshots_seen_init(&s); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); ++ ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ inode_walker_exit(&dir); ++ inode_walker_exit(&target); ++ ++ if (ret) ++ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_hash_info *hash_info, ++ struct inode_walker *inode) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) ++ return ret; ++ ++ ret = __walk_inode(trans, inode, k.k->p); ++ if (ret < 0) ++ return ret; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) ++ return bch2_btree_delete_at(trans, iter, 0); ++ ++ if (ret == INT_MAX) ++ return 0; ++ ++ ret = 0; ++ ++ if (inode->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); ++ ++ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); ++fsck_err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Walk xattrs: verify that they all have a corresponding inode ++ */ ++noinline_for_stack ++static int check_xattrs(struct bch_fs *c) ++{ ++ struct inode_walker inode = inode_walker_init(); ++ struct bch_hash_info hash_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking xattrs"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_xattr(&trans, &iter, k, &hash_info, &inode)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_root_trans(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked root_inode; ++ u32 snapshot; ++ u64 inum; ++ int ret; ++ ++ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { ++ struct bkey_i_subvolume root_subvol; ++ ++ snapshot = U32_MAX; ++ inum = BCACHEFS_ROOT_INO; ++ ++ bkey_subvolume_init(&root_subvol.k_i); ++ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; ++ root_subvol.v.flags = 0; ++ root_subvol.v.snapshot = cpu_to_le32(snapshot); ++ root_subvol.v.inode = cpu_to_le64(inum); ++ 
ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); ++ if (ret) { ++ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ } ++ ++ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (mustfix_fsck_err_on(ret, c, "root directory missing") || ++ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, ++ "root inode not a directory")) { ++ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode.bi_inum = inum; ++ ++ ret = __write_inode(trans, &root_inode, snapshot); ++ if (ret) ++ bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); ++ } ++err: ++fsck_err: ++ return ret; ++} ++ ++/* Get root directory, create if it doesn't exist: */ ++noinline_for_stack ++static int check_root(struct bch_fs *c) ++{ ++ bch_verbose(c, "checking root directory"); ++ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ check_root_trans(&trans)); ++} ++ ++struct pathbuf_entry { ++ u64 inum; ++ u32 snapshot; ++}; ++ ++typedef DARRAY(struct pathbuf_entry) pathbuf; ++ ++static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) ++{ ++ struct pathbuf_entry *i; ++ ++ darray_for_each(*p, i) ++ if (i->inum == inum && ++ i->snapshot == snapshot) ++ return true; ++ ++ return false; ++} ++ ++static int path_down(struct bch_fs *c, pathbuf *p, ++ u64 inum, u32 snapshot) ++{ ++ int ret = darray_push(p, ((struct pathbuf_entry) { ++ .inum = inum, ++ .snapshot = snapshot, ++ })); ++ ++ if (ret) ++ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", ++ p->size); ++ return ret; ++} ++ ++/* ++ * Check that a given inode is reachable from the root: ++ * ++ * XXX: we should also be verifying that inodes are in the right subvolumes ++ */ ++static int check_path(struct btree_trans *trans, ++ pathbuf *p, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ int ret = 0; ++ ++ snapshot = bch2_snapshot_equiv(c, snapshot); ++ p->nr = 0; ++ ++ while (!(inode->bi_inum == BCACHEFS_ROOT_INO && ++ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { ++ struct btree_iter dirent_iter; ++ struct bkey_s_c_dirent d; ++ u32 parent_snapshot = snapshot; ++ ++ if (inode->bi_subvol) { ++ u64 inum; ++ ++ ret = subvol_lookup(trans, inode->bi_parent_subvol, ++ &parent_snapshot, &inum); ++ if (ret) ++ break; ++ } ++ ++ ret = lockrestart_do(trans, ++ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, ++ parent_snapshot))).k)); ++ if (ret && ret != -ENOENT) ++ break; ++ ++ if (!ret && !dirent_points_to_inode(d, inode)) { ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ ret = -ENOENT; ++ } ++ ++ if (ret == -ENOENT) { ++ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", ++ inode->bi_inum, snapshot, ++ bch2_d_type_str(inode_d_type(inode)), ++ inode->bi_nlink, ++ inode->bi_dir, ++ inode->bi_dir_offset)) ++ ret = reattach_inode(trans, inode, snapshot); ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ ++ if (!S_ISDIR(inode->bi_mode)) ++ break; ++ ++ ret = path_down(c, p, inode->bi_inum, snapshot); ++ if (ret) { ++ bch_err(c, "memory allocation failure"); ++ return ret; ++ } ++ ++ snapshot = parent_snapshot; ++ ++ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); ++ if (ret) { ++ /* Should have been caught in dirents pass */ ++ bch_err(c, 
"error looking up parent directory: %i", ret); ++ break; ++ } ++ ++ if (path_is_dup(p, inode->bi_inum, snapshot)) { ++ struct pathbuf_entry *i; ++ ++ /* XXX print path */ ++ bch_err(c, "directory structure loop"); ++ ++ darray_for_each(*p, i) ++ pr_err("%llu:%u", i->inum, i->snapshot); ++ pr_err("%llu:%u", inode->bi_inum, snapshot); ++ ++ if (!fsck_err(c, "directory structure loop")) ++ return 0; ++ ++ ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ remove_backpointer(trans, inode)); ++ if (ret) { ++ bch_err(c, "error removing dirent: %i", ret); ++ break; ++ } ++ ++ ret = reattach_inode(trans, inode, snapshot); ++ } ++ } ++fsck_err: ++ if (ret) ++ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Check for unreachable inodes, as well as loops in the directory structure: ++ * After check_dirents(), if an inode backpointer doesn't exist that means it's ++ * unreachable: ++ */ ++noinline_for_stack ++static int check_directory_structure(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked u; ++ pathbuf path = { 0, }; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (!bkey_is_inode(k.k)) ++ continue; ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) { ++ /* Should have been caught earlier in fsck: */ ++ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); ++ break; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED) ++ continue; ++ ++ ret = check_path(&trans, &path, &u, iter.pos.snapshot); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ darray_exit(&path); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++struct nlink_table { ++ size_t nr; ++ size_t size; ++ ++ struct nlink { ++ u64 inum; ++ u32 snapshot; ++ u32 count; ++ } *d; ++}; ++ ++static int add_nlink(struct bch_fs *c, struct nlink_table *t, ++ u64 inum, u32 snapshot) ++{ ++ if (t->nr == t->size) { ++ size_t new_size = max_t(size_t, 128UL, t->size * 2); ++ void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); ++ if (!d) { ++ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", ++ new_size); ++ return -ENOMEM; ++ } ++ ++ if (t->d) ++ memcpy(d, t->d, t->size * sizeof(t->d[0])); ++ kvfree(t->d); ++ ++ t->d = d; ++ t->size = new_size; ++ } ++ ++ ++ t->d[t->nr++] = (struct nlink) { ++ .inum = inum, ++ .snapshot = snapshot, ++ }; ++ ++ return 0; ++} ++ ++static int nlink_cmp(const void *_l, const void *_r) ++{ ++ const struct nlink *l = _l; ++ const struct nlink *r = _r; ++ ++ return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); ++} ++ ++static void inc_link(struct bch_fs *c, struct snapshots_seen *s, ++ struct nlink_table *links, ++ u64 range_start, u64 range_end, u64 inum, u32 snapshot) ++{ ++ struct nlink *link, key = { ++ .inum = inum, .snapshot = U32_MAX, ++ }; ++ ++ if (inum < range_start || inum >= range_end) ++ return; ++ ++ link = __inline_bsearch(&key, links->d, links->nr, ++ sizeof(links->d[0]), nlink_cmp); ++ if (!link) ++ return; ++ ++ while (link > links->d && link[0].inum == link[-1].inum) ++ --link; ++ ++ for (; link < links->d + links->nr && link->inum == inum; link++) ++ if (ref_visible(c, s, snapshot, link->snapshot)) { ++ link->count++; ++ if (link->snapshot >= snapshot) ++ break; ++ } ++} ++ ++noinline_for_stack ++static int 
check_nlinks_find_hardlinks(struct bch_fs *c, ++ struct nlink_table *t, ++ u64 start, u64 *end) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked u; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, ++ POS(0, start), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (!bkey_is_inode(k.k)) ++ continue; ++ ++ /* Should never fail, checked by bch2_inode_invalid: */ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ /* ++ * Backpointer and directory structure checks are sufficient for ++ * directories, since they can't have hardlinks: ++ */ ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ continue; ++ ++ if (!u.bi_nlink) ++ continue; ++ ++ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); ++ if (ret) { ++ *end = k.k->p.offset; ++ ret = 0; ++ break; ++ } ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); ++ ++ return ret; ++} ++ ++noinline_for_stack ++static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct snapshots_seen s; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ snapshots_seen_init(&s); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); ++ if (ret) ++ break; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_dirent: ++ d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type != DT_DIR && ++ d.v->d_type != DT_SUBVOL) ++ inc_link(c, &s, links, range_start, range_end, ++ le64_to_cpu(d.v->d_inum), ++ bch2_snapshot_equiv(c, d.k->p.snapshot)); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); ++ ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ return ret; ++} ++ ++static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct nlink_table *links, ++ size_t *idx, u64 range_end) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct nlink *link = &links->d[*idx]; ++ int ret = 0; ++ ++ if (k.k->p.offset >= range_end) ++ return 1; ++ ++ if (!bkey_is_inode(k.k)) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ return 0; ++ ++ if (!u.bi_nlink) ++ return 0; ++ ++ while ((cmp_int(link->inum, k.k->p.offset) ?: ++ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { ++ BUG_ON(*idx == links->nr); ++ link = &links->d[++*idx]; ++ } ++ ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, ++ "inode %llu type %s has wrong i_nlink (%u, should be %u)", ++ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], ++ bch2_inode_nlink_get(&u), link->count)) { ++ bch2_inode_nlink_set(&u, link->count); ++ ret = __write_inode(trans, &u, k.k->p.snapshot); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int check_nlinks_update_hardlinks(struct bch_fs *c, ++ struct nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ size_t idx = 0; ++ int ret = 0; ++ ++ 
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS(0, range_start), ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret < 0) { ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++noinline_for_stack ++static int check_nlinks(struct bch_fs *c) ++{ ++ struct nlink_table links = { 0 }; ++ u64 this_iter_range_start, next_iter_range_start = 0; ++ int ret = 0; ++ ++ bch_verbose(c, "checking inode nlinks"); ++ ++ do { ++ this_iter_range_start = next_iter_range_start; ++ next_iter_range_start = U64_MAX; ++ ++ ret = check_nlinks_find_hardlinks(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ ++ ret = check_nlinks_walk_dirents(c, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ ret = check_nlinks_update_hardlinks(c, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ links.nr = 0; ++ } while (next_iter_range_start != U64_MAX); ++ ++ kvfree(links.d); ++ ++ return ret; ++} ++ ++static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p; ++ struct bkey_i_reflink_p *u; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_p) ++ return 0; ++ ++ p = bkey_s_c_to_reflink_p(k); ++ ++ if (!p.v->front_pad && !p.v->back_pad) ++ return 0; ++ ++ u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&u->k_i, k); ++ u->v.front_pad = 0; ++ u->v.back_pad = 0; ++ ++ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); ++} ++ ++noinline_for_stack ++static int fix_reflink_p(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) ++ return 0; ++ ++ bch_verbose(c, "fixing reflink_p keys"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ fix_reflink_p_key(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* ++ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
++ * Doesn't fix them yet, mainly because they haven't yet been observed: ++ */ ++int bch2_fsck_full(struct bch_fs *c) ++{ ++ int ret; ++again: ++ ret = bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: ++ check_inodes(c, true) ?: ++ check_extents(c) ?: ++ check_dirents(c) ?: ++ check_xattrs(c) ?: ++ check_root(c) ?: ++ check_directory_structure(c) ?: ++ check_nlinks(c) ?: ++ fix_reflink_p(c); ++ ++ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ goto again; ++ } ++ ++ return ret; ++} ++ ++int bch2_fsck_walk_inodes_only(struct bch_fs *c) ++{ ++ return bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: ++ check_inodes(c, false); ++} +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +new file mode 100644 +index 000000000000..264f2706b12d +--- /dev/null ++++ b/fs/bcachefs/fsck.h +@@ -0,0 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FSCK_H ++#define _BCACHEFS_FSCK_H ++ ++int bch2_fsck_full(struct bch_fs *); ++int bch2_fsck_walk_inodes_only(struct bch_fs *); ++ ++#endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +new file mode 100644 +index 000000000000..083106006747 +--- /dev/null ++++ b/fs/bcachefs/inode.c +@@ -0,0 +1,771 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "str_hash.h" ++#include "subvolume.h" ++#include "varint.h" ++ ++#include ++ ++#include ++ ++const char * const bch2_inode_opts[] = { ++#define x(name, ...) #name, ++ BCH_INODE_OPTS() ++#undef x ++ NULL, ++}; ++ ++static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; ++ ++static int inode_decode_field(const u8 *in, const u8 *end, ++ u64 out[2], unsigned *out_bits) ++{ ++ __be64 be[2] = { 0, 0 }; ++ unsigned bytes, shift; ++ u8 *p; ++ ++ if (in >= end) ++ return -1; ++ ++ if (!*in) ++ return -1; ++ ++ /* ++ * position of highest set bit indicates number of bytes: ++ * shift = number of bits to remove in high byte: ++ */ ++ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ ++ bytes = byte_table[shift - 1]; ++ ++ if (in + bytes > end) ++ return -1; ++ ++ p = (u8 *) be + 16 - bytes; ++ memcpy(p, in, bytes); ++ *p ^= (1 << 8) >> shift; ++ ++ out[0] = be64_to_cpu(be[0]); ++ out[1] = be64_to_cpu(be[1]); ++ *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); ++ ++ return bytes; ++} ++ ++void bch2_inode_pack(struct bch_fs *c, ++ struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ struct bkey_i_inode_v2 *k = &packed->inode; ++ u8 *out = k->v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ int ret; ++ ++ bkey_inode_v2_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); ++ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++#define x(_name, _bits) \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ ret = bch2_varint_encode_fast(out, inode->_name); \ ++ out += ret; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } else { \ ++ *out++ = 0; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ BUG_ON(out > end); ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODEv2_NR_FIELDS(&k->v, nr_fields); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct bch_inode_unpacked unpacked; ++ ++ int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), ++ &unpacked); ++ BUG_ON(ret); ++ BUG_ON(unpacked.bi_inum != inode->bi_inum); ++ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_mode != inode->bi_mode); ++ ++#define x(_name, _bits) if (unpacked._name != inode->_name) \ ++ panic("unpacked %llu should be %llu", \ ++ (u64) unpacked._name, (u64) inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++ } ++} ++ ++static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = bkey_val_end(inode); ++ u64 field[2]; ++ unsigned fieldnr = 0, field_bits; ++ int ret; ++ ++#define x(_name, _bits) \ ++ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ ++ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ ++ memset((void *) unpacked + offset, 0, \ ++ sizeof(*unpacked) - offset); \ ++ return 0; \ ++ } \ ++ \ ++ ret = inode_decode_field(in, end, field, &field_bits); \ ++ if (ret < 0) \ ++ return ret; \ ++ \ ++ if (field_bits > sizeof(unpacked->_name) * 8) \ ++ return -1; \ ++ \ ++ unpacked->_name = field[1]; \ ++ in += ret; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? 
*/ ++ return 0; ++} ++ ++static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, ++ const u8 *in, const u8 *end, ++ unsigned nr_fields) ++{ ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v[2]; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[0]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ \ ++ if (_bits > 64) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[1]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v[1] = 0; \ ++ } \ ++ } else { \ ++ v[0] = v[1] = 0; \ ++ } \ ++ \ ++ unpacked->_name = v[0]; \ ++ if (v[1] || v[0] != unpacked->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++int bch2_inode_unpack(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= 0; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ if (INODE_NEW_VARINT(inode.v)) { ++ return bch2_inode_unpack_v2(unpacked, inode.v->fields, ++ bkey_val_end(inode), ++ INODE_NR_FIELDS(inode.v)); ++ } else { ++ return bch2_inode_unpack_v1(inode, unpacked); ++ } ++ break; ++ } ++ case KEY_TYPE_inode_v2: { ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ return bch2_inode_unpack_v2(unpacked, inode.v->fields, ++ bkey_val_end(inode), ++ INODEv2_NR_FIELDS(inode.v)); ++ } ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_inode_peek(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode, ++ subvol_inum inum, unsigned flags) ++{ ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, snapshot), ++ flags|BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = bkey_is_inode(k.k) ? 
0 : -ENOENT; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(k, inode); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++} ++ ++int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bkey_inode_buf *inode_p; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ bch2_inode_pack(trans->c, inode_p, inode); ++ inode_p->inode.k.p.snapshot = iter->snapshot; ++ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++} ++ ++static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) ++{ ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.inode) { ++ prt_printf(err, "nonzero k.p.inode"); ++ return -EINVAL; ++ } ++ ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) { ++ prt_printf(err, "fs inode in blockdev range"); ++ return -EINVAL; ++ } ++ ++ if (bch2_inode_unpack(k, &unpacked)){ ++ prt_printf(err, "invalid variable length fields"); ++ return -EINVAL; ++ } ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { ++ prt_printf(err, "invalid data checksum type (%u >= %u", ++ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); ++ return -EINVAL; ++ } ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { ++ prt_printf(err, "invalid data checksum type (%u >= %u)", ++ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); ++ return -EINVAL; ++ } ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) { ++ prt_printf(err, "flagged as unlinked but bi_nlink != 0"); ++ return -EINVAL; ++ } ++ ++ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { ++ prt_printf(err, "subvolume root but not a directory"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } ++ ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ prt_printf(err, "invalid str hash type (%llu >= %u)", ++ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } ++ ++ return __bch2_inode_invalid(k, err); ++} ++ ++int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } ++ ++ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ prt_printf(err, "invalid str hash type (%llu >= %u)", ++ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } ++ ++ return __bch2_inode_invalid(k, err); ++} ++ ++static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++{ ++ prt_printf(out, "mode %o flags %x journal_seq %llu", ++ inode->bi_mode, inode->bi_flags, ++ inode->bi_journal_seq); ++ ++#define x(_name, _bits) \ ++ prt_printf(out, " "#_name " %llu", (u64) inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++{ ++ prt_printf(out, "inum: %llu ", inode->bi_inum); ++ __bch2_inode_unpacked_to_text(out, inode); ++} ++ ++void bch2_inode_to_text(struct 
printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bch_inode_unpacked inode; ++ ++ if (bch2_inode_unpack(k, &inode)) { ++ prt_printf(out, "(unpack error)"); ++ return; ++ } ++ ++ __bch2_inode_unpacked_to_text(out, &inode); ++} ++ ++int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->p.inode) { ++ prt_printf(err, "nonzero k.p.inode"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); ++ ++ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); ++} ++ ++void bch2_inode_init_early(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u) ++{ ++ enum bch_str_hash_type str_hash = ++ bch2_str_hash_opt_to_type(c, c->opts.str_hash); ++ ++ memset(inode_u, 0, sizeof(*inode_u)); ++ ++ /* ick */ ++ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; ++ get_random_bytes(&inode_u->bi_hash_seed, ++ sizeof(inode_u->bi_hash_seed)); ++} ++ ++void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ inode_u->bi_mode = mode; ++ inode_u->bi_uid = uid; ++ inode_u->bi_gid = gid; ++ inode_u->bi_dev = rdev; ++ inode_u->bi_atime = now; ++ inode_u->bi_mtime = now; ++ inode_u->bi_ctime = now; ++ inode_u->bi_otime = now; ++ ++ if (parent && parent->bi_mode & S_ISGID) { ++ inode_u->bi_gid = parent->bi_gid; ++ if (S_ISDIR(mode)) ++ inode_u->bi_mode |= S_ISGID; ++ } ++ ++ if (parent) { ++#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ } ++} ++ ++void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ bch2_inode_init_early(c, inode_u); ++ bch2_inode_init_late(inode_u, bch2_current_time(c), ++ uid, gid, mode, rdev, parent); ++} ++ ++static inline u32 bkey_generation(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ case KEY_TYPE_inode_v2: ++ BUG(); ++ case KEY_TYPE_inode_generation: ++ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ default: ++ return 0; ++ } ++} ++ ++/* ++ * This just finds an empty slot: ++ */ ++int bch2_inode_create(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode_u, ++ u32 snapshot, u64 cpu) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ u64 min, max, start, pos, *hint; ++ int ret = 0; ++ unsigned bits = (c->opts.inodes_32bit ? 
31 : 63); ++ ++ if (c->opts.shard_inode_numbers) { ++ bits -= c->inode_shard_bits; ++ ++ min = (cpu << bits); ++ max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ ++ min = max_t(u64, min, BLOCKDEV_INODE_MAX); ++ hint = c->unused_inode_hints + cpu; ++ } else { ++ min = BLOCKDEV_INODE_MAX; ++ max = ~(ULLONG_MAX << bits); ++ hint = c->unused_inode_hints; ++ } ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; ++ ++ pos = start; ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++again: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(k.k->p, POS(0, max)) < 0) { ++ while (pos < iter->pos.offset) { ++ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) ++ goto found_slot; ++ ++ pos++; ++ } ++ ++ if (k.k->p.snapshot == snapshot && ++ !bkey_is_inode(k.k) && ++ !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { ++ bch2_btree_iter_advance(iter); ++ continue; ++ } ++ ++ /* ++ * We don't need to iterate over keys in every snapshot once ++ * we've found just one: ++ */ ++ pos = iter->pos.offset + 1; ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ } ++ ++ while (!ret && pos < max) { ++ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) ++ goto found_slot; ++ ++ pos++; ++ } ++ ++ if (!ret && start == min) ++ ret = -ENOSPC; ++ ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++ } ++ ++ /* Retry from start */ ++ pos = start = min; ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ goto again; ++found_slot: ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++ } ++ ++ /* We may have raced while the iterator wasn't pointing at pos: */ ++ if (bkey_is_inode(k.k) || ++ bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) ++ goto again; ++ ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; ++ inode_u->bi_generation = bkey_generation(k); ++ return 0; ++} ++ ++static int bch2_inode_delete_keys(struct btree_trans *trans, ++ subvol_inum inum, enum btree_id id) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i delete; ++ u32 snapshot; ++ int ret = 0; ++ ++ /* ++ * We're never going to be deleting extents, no need to use an extent ++ * iterator: ++ */ ++ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ++ while (1) { ++ bch2_trans_begin(trans); ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&iter, snapshot); ++ ++ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k) ++ break; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter.pos; ++ ++ ret = bch2_trans_update(trans, &iter, &delete, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 1024); ++ ++ /* ++ * If this was a 
directory, there shouldn't be any real dirents left - ++ * but there could be whiteouts (from hash collisions) that we should ++ * delete: ++ * ++ * XXX: the dirent could ideally would delete whiteouts when they're no ++ * longer needed ++ */ ++ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: ++ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: ++ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch2_fs_inconsistent(trans.c, ++ "inode %llu not found when deleting", ++ inum.inum); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(k, &inode_u); ++ ++ /* Subvolume root? */ ++ BUG_ON(inode_u.bi_subvol); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, ++ subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &iter, inode, inum, 0); ++ if (!ret) ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_inode_find_by_inum_trans(&trans, inum, inode)); ++} ++ ++int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ else { ++ if (bi->bi_nlink == U32_MAX) ++ return -EINVAL; ++ ++ bi->bi_nlink++; ++ } ++ ++ return 0; ++} ++ ++void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { ++ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", ++ bi->bi_inum); ++ return; ++ } ++ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) { ++ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); ++ return; ++ } ++ ++ if (bi->bi_nlink) ++ bi->bi_nlink--; ++ else ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++} +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +new file mode 100644 +index 000000000000..2ac2fc10513b +--- /dev/null ++++ b/fs/bcachefs/inode.h +@@ -0,0 +1,189 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_INODE_H ++#define _BCACHEFS_INODE_H ++ ++#include "opts.h" ++ ++extern const char * const bch2_inode_opts[]; ++ ++int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = 
bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ ++} ++ ++#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_v2_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ ++} ++ ++static inline bool bkey_is_inode(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_inode || ++ k->type == KEY_TYPE_inode_v2; ++} ++ ++int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_generation_invalid, \ ++ .val_to_text = bch2_inode_generation_to_text, \ ++} ++ ++#if 0 ++typedef struct { ++ u64 lo; ++ u32 hi; ++} __packed __aligned(4) u96; ++#endif ++typedef u64 u96; ++ ++struct bch_inode_unpacked { ++ u64 bi_inum; ++ u64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ u32 bi_flags; ++ u16 bi_mode; ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_FIELDS() ++#undef x ++}; ++ ++struct bkey_inode_buf { ++ struct bkey_i_inode_v2 inode; ++ ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[0 + BCH_INODE_FIELDS()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ ++void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, ++ const struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); ++ ++void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); ++ ++int bch2_inode_peek(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, subvol_inum, unsigned); ++int bch2_inode_write(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *); ++ ++void bch2_inode_init_early(struct bch_fs *, ++ struct bch_inode_unpacked *); ++void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_create(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, u32, u64); ++ ++int bch2_inode_rm(struct bch_fs *, subvol_inum); ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, ++ struct bch_inode_unpacked *); ++ ++static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts ret = { 0 }; ++ ++#define x(_name, _bits) \ ++ if (inode->bi_##_name) \ ++ opt_set(ret, _name, inode->bi_##_name - 1); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ inode->bi_##_name = v; \ ++ break; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) 
\ ++ case Inode_opt_##_name: \ ++ return inode->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct bch_io_opts ++io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); ++ ++ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); ++ return opts; ++} ++ ++static inline u8 mode_to_type(umode_t mode) ++{ ++ return (mode >> 12) & 15; ++} ++ ++static inline u8 inode_d_type(struct bch_inode_unpacked *inode) ++{ ++ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); ++} ++ ++/* i_nlink: */ ++ ++static inline unsigned nlink_bias(umode_t mode) ++{ ++ return S_ISDIR(mode) ? 2 : 1; ++} ++ ++static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_flags & BCH_INODE_UNLINKED ++ ? 0 ++ : bi->bi_nlink + nlink_bias(bi->bi_mode); ++} ++ ++static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, ++ unsigned nlink) ++{ ++ if (nlink) { ++ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ } else { ++ bi->bi_nlink = 0; ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++ } ++} ++ ++int bch2_inode_nlink_inc(struct bch_inode_unpacked *); ++void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +new file mode 100644 +index 000000000000..971f8ba00dbd +--- /dev/null ++++ b/fs/bcachefs/io.c +@@ -0,0 +1,2422 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Some low level IO code, and hacks for various block layer limitations ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "bset.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "compress.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "rebalance.h" ++#include "subvolume.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++#include ++ ++const char *bch2_blk_status_to_str(blk_status_t status) ++{ ++ if (status == BLK_STS_REMOVED) ++ return "device removed"; ++ return blk_status_to_str(status); ++} ++ ++static bool bch2_target_congested(struct bch_fs *c, u16 target) ++{ ++ const struct bch_devs_mask *devs; ++ unsigned d, nr = 0, total = 0; ++ u64 now = local_clock(), last; ++ s64 congested; ++ struct bch_dev *ca; ++ ++ if (!target) ++ return false; ++ ++ rcu_read_lock(); ++ devs = bch2_target_to_mask(c, target) ?: ++ &c->rw_devs[BCH_DATA_user]; ++ ++ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ++ ca = rcu_dereference(c->devs[d]); ++ if (!ca) ++ continue; ++ ++ congested = atomic_read(&ca->congested); ++ last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ total += max(congested, 0LL); ++ nr++; ++ } ++ rcu_read_unlock(); ++ ++ return bch2_rand_range(nr * CONGESTED_MAX) < total; ++} ++ ++static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, ++ u64 now, int rw) ++{ ++ u64 latency_capable = ++ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; ++ /* ideally we'd be taking into account the device's variance here: */ ++ u64 
latency_threshold = latency_capable << (rw == READ ? 2 : 3); ++ s64 latency_over = io_latency - latency_threshold; ++ ++ if (latency_threshold && latency_over > 0) { ++ /* ++ * bump up congested by approximately latency_over * 4 / ++ * latency_threshold - we don't need much accuracy here so don't ++ * bother with the divide: ++ */ ++ if (atomic_read(&ca->congested) < CONGESTED_MAX) ++ atomic_add(latency_over >> ++ max_t(int, ilog2(latency_threshold) - 2, 0), ++ &ca->congested); ++ ++ ca->congested_last = now; ++ } else if (atomic_read(&ca->congested) > 0) { ++ atomic_dec(&ca->congested); ++ } ++} ++ ++void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) ++{ ++ atomic64_t *latency = &ca->cur_latency[rw]; ++ u64 now = local_clock(); ++ u64 io_latency = time_after64(now, submit_time) ++ ? now - submit_time ++ : 0; ++ u64 old, new, v = atomic64_read(latency); ++ ++ do { ++ old = v; ++ ++ /* ++ * If the io latency was reasonably close to the current ++ * latency, skip doing the update and atomic operation - most of ++ * the time: ++ */ ++ if (abs((int) (old - io_latency)) < (old >> 1) && ++ now & ~(~0U << 5)) ++ break; ++ ++ new = ewma_add(old, io_latency, 5); ++ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); ++ ++ bch2_congested_acct(ca, io_latency, now, rw); ++ ++ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); ++} ++ ++/* Allocate, free from mempool: */ ++ ++void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ if (bv->bv_page != ZERO_PAGE(0)) ++ mempool_free(bv->bv_page, &c->bio_bounce_pages); ++ bio->bi_vcnt = 0; ++} ++ ++static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) ++{ ++ struct page *page; ++ ++ if (likely(!*using_mempool)) { ++ page = alloc_page(GFP_NOIO); ++ if (unlikely(!page)) { ++ mutex_lock(&c->bio_bounce_pages_lock); ++ *using_mempool = true; ++ goto pool_alloc; ++ ++ } ++ } else { ++pool_alloc: ++ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); ++ } ++ ++ return page; ++} ++ ++void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, ++ size_t size) ++{ ++ bool using_mempool = false; ++ ++ while (size) { ++ struct page *page = __bio_alloc_page_pool(c, &using_mempool); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ if (using_mempool) ++ mutex_unlock(&c->bio_bounce_pages_lock); ++} ++ ++/* Extent update path: */ ++ ++int bch2_sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool *usage_increasing, ++ s64 *i_sectors_delta, ++ s64 *disk_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); ++ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); ++ int ret = 0; ++ ++ *usage_increasing = false; ++ *i_sectors_delta = 0; ++ *disk_sectors_delta = 0; ++ ++ bch2_trans_copy_iter(&iter, extent_iter); ++ ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { ++ s64 sectors = min(new->k.p.offset, old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k)); ++ ++ *i_sectors_delta += sectors * ++ (bkey_extent_is_allocation(&new->k) - ++ bkey_extent_is_allocation(old.k)); ++ ++ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); ++ 
*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot ++ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) ++ : 0; ++ ++ if (!*usage_increasing && ++ (new->k.p.snapshot != old.k->p.snapshot || ++ new_replicas > bch2_bkey_replicas(c, old) || ++ (!new_compressed && bch2_bkey_sectors_compressed(old)))) ++ *usage_increasing = true; ++ ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_extent_update(struct btree_trans *trans, ++ subvol_inum inum, ++ struct btree_iter *iter, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ u64 new_i_size, ++ s64 *i_sectors_delta_total, ++ bool check_enospc) ++{ ++ struct btree_iter inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct bpos next_pos; ++ bool usage_increasing; ++ s64 i_sectors_delta = 0, disk_sectors_delta = 0; ++ int ret; ++ ++ /* ++ * This traverses us the iterator without changing iter->path->pos to ++ * search_key() (which is pos + 1 for extents): we want there to be a ++ * path already traversed at iter->pos because ++ * bch2_trans_extent_update() will use it to attempt extent merging ++ */ ++ ret = __bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ ret = bch2_extent_trim_atomic(trans, iter, k); ++ if (ret) ++ return ret; ++ ++ new_i_size = min(k->k.p.offset << 9, new_i_size); ++ next_pos = k->k.p; ++ ++ ret = bch2_sum_sector_overwrites(trans, iter, k, ++ &usage_increasing, ++ &i_sectors_delta, ++ &disk_sectors_delta); ++ if (ret) ++ return ret; ++ ++ if (disk_res && ++ disk_sectors_delta > (s64) disk_res->sectors) { ++ ret = bch2_disk_reservation_add(trans->c, disk_res, ++ disk_sectors_delta - disk_res->sectors, ++ !check_enospc || !usage_increasing ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ return ret; ++ } ++ ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret; ++ ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ ++ inode_u.bi_sectors += i_sectors_delta; ++ ++ ret = bch2_trans_update(trans, iter, k, 0) ?: ++ bch2_inode_write(trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(trans, disk_res, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ ++ if (ret) ++ return ret; ++ ++ if (i_sectors_delta_total) ++ *i_sectors_delta_total += i_sectors_delta; ++ bch2_btree_iter_set_pos(iter, next_pos); ++ ++ return 0; ++} ++ ++/* ++ * Returns -BCH_ERR_transacton_restart if we had to drop locks: ++ */ ++int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ++ subvol_inum inum, u64 end, ++ s64 *i_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bpos end_pos = POS(inum.inum, end); ++ struct bkey_s_c k; ++ int ret = 0, ret2 = 0; ++ u32 snapshot; ++ ++ while (!ret || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ ++ if (ret) ++ ret2 = ret; ++ ++ bch2_trans_begin(trans); ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(iter, snapshot); ++ ++ k = bch2_btree_iter_peek(iter); ++ if (bkey_cmp(iter->pos, end_pos) >= 0) { ++ bch2_btree_iter_set_pos(iter, end_pos); ++ break; ++ } ++ ++ ret = bkey_err(k); ++ if (ret) ++ continue; ++ ++ 
bkey_init(&delete.k); ++ delete.k.p = iter->pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end_pos, &delete); ++ ++ ret = bch2_extent_update(trans, inum, iter, &delete, ++ &disk_res, NULL, ++ 0, i_sectors_delta, false); ++ bch2_disk_reservation_put(c, &disk_res); ++ } ++ ++ return ret ?: ret2; ++} ++ ++int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, ++ s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(inum.inum, start), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ ++ return ret; ++} ++ ++int bch2_write_index_default(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_buf sk; ++ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ subvol_inum inum = { ++ .subvol = op->subvol, ++ .inum = k->k.p.inode, ++ }; ++ int ret; ++ ++ BUG_ON(!inum.subvol); ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_keylist_front(keys); ++ bch2_bkey_buf_copy(&sk, c, k); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, ++ &sk.k->k.p.snapshot); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ bkey_start_pos(&sk.k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ ret = bch2_extent_update(&trans, inum, &iter, sk.k, ++ &op->res, op_journal_seq(op), ++ op->new_i_size, &op->i_sectors_delta, ++ op->flags & BCH_WRITE_CHECK_ENOSPC); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ if (ec_ob) ++ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); ++ ++ if (bkey_cmp(iter.pos, k->k.p) >= 0) ++ bch2_keylist_pop_front(&op->insert_keys); ++ else ++ bch2_cut_front(iter.pos, k); ++ } while (!bch2_keylist_empty(keys)); ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* Writes */ ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ++ enum bch_data_type type, ++ const struct bkey_i *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); ++ const struct bch_extent_ptr *ptr; ++ struct bch_write_bio *n; ++ struct bch_dev *ca; ++ ++ BUG_ON(c->opts.nochanges); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || ++ !c->devs[ptr->dev]); ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (to_entry(ptr + 1) < ptrs.end) { ++ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, ++ GFP_NOIO, &ca->replica_set)); ++ ++ n->bio.bi_end_io = wbio->bio.bi_end_io; ++ n->bio.bi_private = wbio->bio.bi_private; ++ n->parent = wbio; ++ n->split = true; ++ n->bounce = false; ++ n->put_bio = true; ++ n->bio.bi_opf = wbio->bio.bi_opf; ++ bio_inc_remaining(&wbio->bio); ++ } else { ++ n = wbio; ++ n->split = false; ++ } ++ ++ n->c = c; ++ n->dev = ptr->dev; ++ n->have_ioref = bch2_dev_get_ioref(ca, ++ type == BCH_DATA_btree ? 
READ : WRITE); ++ n->submit_time = local_clock(); ++ n->bio.bi_iter.bi_sector = ptr->offset; ++ ++ if (likely(n->have_ioref)) { ++ this_cpu_add(ca->io_done->sectors[WRITE][type], ++ bio_sectors(&n->bio)); ++ ++ bio_set_dev(&n->bio, ca->disk_sb.bdev); ++ submit_bio(&n->bio); ++ } else { ++ n->bio.bi_status = BLK_STS_REMOVED; ++ bio_endio(&n->bio); ++ } ++ } ++} ++ ++static void __bch2_write(struct closure *); ++ ++static void bch2_write_done(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) ++ op->error = bch2_journal_error(&c->journal); ++ ++ bch2_disk_reservation_put(c, &op->res); ++ percpu_ref_put(&c->writes); ++ bch2_keylist_free(&op->insert_keys, op->inline_keys); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/** ++ * bch_write_index - after a write, update index to point to new data ++ */ ++static void __bch2_write_index(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *src, *dst = keys->keys, *n, *k; ++ unsigned dev; ++ int ret; ++ ++ for (src = keys->keys; src != keys->top; src = n) { ++ n = bkey_next(src); ++ ++ if (bkey_extent_is_direct_data(&src->k)) { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, ++ test_bit(ptr->dev, op->failed.d)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ if (dst != src) ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ ++ keys->top = dst; ++ ++ /* ++ * probably not the ideal place to hook this in, but I don't ++ * particularly want to plumb io_opts all the way through the btree ++ * update stack right now ++ */ ++ for_each_keylist_key(keys, k) { ++ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); ++ ++ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) ++ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); ++ ++ } ++ ++ if (!bch2_keylist_empty(keys)) { ++ u64 sectors_start = keylist_sectors(keys); ++ int ret = op->index_update_fn(op); ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); ++ BUG_ON(keylist_sectors(keys) && !ret); ++ ++ op->written += sectors_start - keylist_sectors(keys); ++ ++ if (ret) { ++ bch_err_inum_ratelimited(c, op->pos.inode, ++ "write error %i from btree update", ret); ++ op->error = ret; ++ } ++ } ++out: ++ /* If some a bucket wasn't written, we can't erasure code it: */ ++ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) ++ bch2_open_bucket_write_error(c, &op->open_buckets, dev); ++ ++ bch2_open_buckets_put(c, &op->open_buckets); ++ return; ++err: ++ keys->top = keys->keys; ++ op->error = ret; ++ goto out; ++} ++ ++static void bch2_write_index(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) { ++ continue_at(cl, __bch2_write, index_update_wq(op)); ++ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ bch2_journal_flush_seq_async(&c->journal, ++ *op_journal_seq(op), ++ cl); ++ continue_at(cl, bch2_write_done, index_update_wq(op)); ++ } else { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ } ++} ++ ++static void bch2_write_endio(struct bio *bio) ++{ ++ struct 
closure *cl = bio->bi_private; ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ op->pos.inode, ++ op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ ++ "data write error: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ set_bit(wbio->dev, op->failed.d); ++ ++ if (wbio->have_ioref) { ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (wbio->bounce) ++ bch2_bio_free_pages_pool(c, bio); ++ ++ if (wbio->put_bio) ++ bio_put(bio); ++ ++ if (parent) ++ bio_endio(&parent->bio); ++ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) ++ closure_put(cl); ++ else ++ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++} ++ ++static void init_append_extent(struct bch_write_op *op, ++ struct write_point *wp, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_i_extent *e; ++ ++ op->pos.offset += crc.uncompressed_size; ++ ++ e = bkey_extent_init(op->insert_keys.top); ++ e->k.p = op->pos; ++ e->k.size = crc.uncompressed_size; ++ e->k.version = version; ++ ++ if (crc.csum_type || ++ crc.compression_type || ++ crc.nonce) ++ bch2_extent_crc_append(&e->k_i, crc); ++ ++ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, ++ op->flags & BCH_WRITE_CACHED); ++ ++ bch2_keylist_push(&op->insert_keys); ++} ++ ++static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ++ struct write_point *wp, ++ struct bio *src, ++ bool *page_alloc_failed, ++ void *buf) ++{ ++ struct bch_write_bio *wbio; ++ struct bio *bio; ++ unsigned output_available = ++ min(wp->sectors_free << 9, src->bi_iter.bi_size); ++ unsigned pages = DIV_ROUND_UP(output_available + ++ (buf ++ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) ++ : 0), PAGE_SIZE); ++ ++ pages = min(pages, BIO_MAX_VECS); ++ ++ bio = bio_alloc_bioset(NULL, pages, 0, ++ GFP_NOIO, &c->bio_write); ++ wbio = wbio_init(bio); ++ wbio->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ wbio->bio.bi_opf = src->bi_opf; ++ ++ if (buf) { ++ bch2_bio_map(bio, buf, output_available); ++ return bio; ++ } ++ ++ wbio->bounce = true; ++ ++ /* ++ * We can't use mempool for more than c->sb.encoded_extent_max ++ * worth of pages, but we'd like to allocate more if we can: ++ */ ++ bch2_bio_alloc_pages_pool(c, bio, ++ min_t(unsigned, output_available, ++ c->opts.encoded_extent_max)); ++ ++ if (bio->bi_iter.bi_size < output_available) ++ *page_alloc_failed = ++ bch2_bio_alloc_pages(bio, ++ output_available - ++ bio->bi_iter.bi_size, ++ GFP_NOFS) != 0; ++ ++ return bio; ++} ++ ++static int bch2_write_rechecksum(struct bch_fs *c, ++ struct bch_write_op *op, ++ unsigned new_csum_type) ++{ ++ struct bio *bio = &op->wbio.bio; ++ struct bch_extent_crc_unpacked new_crc; ++ int ret; ++ ++ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ ++ ++ if (bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)) ++ new_csum_type = op->crc.csum_type; ++ ++ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, ++ NULL, &new_crc, ++ op->crc.offset, op->crc.live_size, ++ new_csum_type); ++ if (ret) ++ return ret; ++ ++ bio_advance(bio, op->crc.offset << 9); ++ bio->bi_iter.bi_size = op->crc.live_size << 9; ++ op->crc = new_crc; ++ return 0; ++} ++ ++static int bch2_write_decrypt(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct nonce nonce = extent_nonce(op->version, op->crc); ++ struct bch_csum csum; ++ int ret; ++ ++ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) ++ return 0; ++ ++ /* ++ * If we need to decrypt data in the write path, we'll no longer be able ++ * to verify the existing checksum (poly1305 mac, in this case) after ++ * it's decrypted - this is the last point we'll be able to reverify the ++ * checksum: ++ */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return -EIO; ++ ++ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ op->crc.csum_type = 0; ++ op->crc.csum = (struct bch_csum) { 0, 0 }; ++ return ret; ++} ++ ++static enum prep_encoded_ret { ++ PREP_ENCODED_OK, ++ PREP_ENCODED_ERR, ++ PREP_ENCODED_CHECKSUM_ERR, ++ PREP_ENCODED_DO_WRITE, ++} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *bio = &op->wbio.bio; ++ ++ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ return PREP_ENCODED_OK; ++ ++ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); ++ ++ /* Can we just write the entire extent as is? 
*/ ++ if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.compressed_size <= wp->sectors_free && ++ (op->crc.compression_type == op->compression_type || ++ op->incompressible)) { ++ if (!crc_is_compressed(op->crc) && ++ op->csum_type != op->crc.csum_type && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_DO_WRITE; ++ } ++ ++ /* ++ * If the data is compressed and we couldn't write the entire extent as ++ * is, we have to decompress it: ++ */ ++ if (crc_is_compressed(op->crc)) { ++ struct bch_csum csum; ++ ++ if (bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* Last point we can still verify checksum: */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, ++ extent_nonce(op->version, op->crc), ++ bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ return PREP_ENCODED_ERR; ++ } ++ ++ /* ++ * No longer have compressed data after this point - data might be ++ * encrypted: ++ */ ++ ++ /* ++ * If the data is checksummed and we're only writing a subset, ++ * rechecksum and adjust bio to point to currently live data: ++ */ ++ if ((op->crc.live_size != op->crc.uncompressed_size || ++ op->crc.csum_type != op->csum_type) && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* ++ * If we want to compress the data, it has to be decrypted: ++ */ ++ if ((op->compression_type || ++ bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(op->csum_type)) && ++ bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_OK; ++} ++ ++static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ++ struct bio **_dst) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *src = &op->wbio.bio, *dst = src; ++ struct bvec_iter saved_iter; ++ void *ec_buf; ++ unsigned total_output = 0, total_input = 0; ++ bool bounce = false; ++ bool page_alloc_failed = false; ++ int ret, more = 0; ++ ++ BUG_ON(!bio_sectors(src)); ++ ++ ec_buf = bch2_writepoint_ec_buf(c, wp); ++ ++ switch (bch2_write_prep_encoded_data(op, wp)) { ++ case PREP_ENCODED_OK: ++ break; ++ case PREP_ENCODED_ERR: ++ ret = -EIO; ++ goto err; ++ case PREP_ENCODED_CHECKSUM_ERR: ++ goto csum_err; ++ case PREP_ENCODED_DO_WRITE: ++ /* XXX look for bug here */ ++ if (ec_buf) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bio_copy_data(dst, src); ++ bounce = true; ++ } ++ init_append_extent(op, wp, op->version, op->crc); ++ goto do_write; ++ } ++ ++ if (ec_buf || ++ op->compression_type || ++ (op->csum_type && ++ !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ (bch2_csum_type_is_encryption(op->csum_type) && ++ !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bounce = true; ++ } ++ ++ saved_iter = dst->bi_iter; ++ ++ do { ++ struct bch_extent_crc_unpacked crc = ++ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bversion version = op->version; ++ size_t dst_len, src_len; ++ ++ if (page_alloc_failed && ++ dst->bi_iter.bi_size < (wp->sectors_free << 9) && ++ dst->bi_iter.bi_size < c->opts.encoded_extent_max) ++ break; ++ ++ BUG_ON(op->compression_type && ++ (op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_csum_type_is_encryption(op->crc.csum_type)); ++ BUG_ON(op->compression_type && !bounce); ++ ++ crc.compression_type = op->incompressible ++ ? 
BCH_COMPRESSION_TYPE_incompressible ++ : op->compression_type ++ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) ++ : 0; ++ if (!crc_is_compressed(crc)) { ++ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); ++ ++ if (op->csum_type) ++ dst_len = min_t(unsigned, dst_len, ++ c->opts.encoded_extent_max); ++ ++ if (bounce) { ++ swap(dst->bi_iter.bi_size, dst_len); ++ bio_copy_data(dst, src); ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ src_len = dst_len; ++ } ++ ++ BUG_ON(!src_len || !dst_len); ++ ++ if (bch2_csum_type_is_encryption(op->csum_type)) { ++ if (bversion_zero(version)) { ++ version.lo = atomic64_inc_return(&c->key_version); ++ } else { ++ crc.nonce = op->nonce; ++ op->nonce += src_len >> 9; ++ } ++ } ++ ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ !crc_is_compressed(crc) && ++ bch2_csum_type_is_encryption(op->crc.csum_type) == ++ bch2_csum_type_is_encryption(op->csum_type)) { ++ /* ++ * Note: when we're using rechecksum(), we need to be ++ * checksumming @src because it has all the data our ++ * existing checksum covers - if we bounced (because we ++ * were trying to compress), @dst will only have the ++ * part of the data the new checksum will cover. ++ * ++ * But normally we want to be checksumming post bounce, ++ * because part of the reason for bouncing is so the ++ * data can't be modified (by userspace) while it's in ++ * flight. ++ */ ++ if (bch2_rechecksum_bio(c, src, version, op->crc, ++ &crc, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->csum_type)) ++ goto csum_err; ++ } else { ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_rechecksum_bio(c, src, version, op->crc, ++ NULL, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->crc.csum_type)) ++ goto csum_err; ++ ++ crc.compressed_size = dst_len >> 9; ++ crc.uncompressed_size = src_len >> 9; ++ crc.live_size = src_len >> 9; ++ ++ swap(dst->bi_iter.bi_size, dst_len); ++ ret = bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ if (ret) ++ goto err; ++ ++ crc.csum = bch2_checksum_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum_type = op->csum_type; ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ init_append_extent(op, wp, version, crc); ++ ++ if (dst != src) ++ bio_advance(dst, dst_len); ++ bio_advance(src, src_len); ++ total_output += dst_len; ++ total_input += src_len; ++ } while (dst->bi_iter.bi_size && ++ src->bi_iter.bi_size && ++ wp->sectors_free && ++ !bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)); ++ ++ more = src->bi_iter.bi_size != 0; ++ ++ dst->bi_iter = saved_iter; ++ ++ if (dst == src && more) { ++ BUG_ON(total_output != total_input); ++ ++ dst = bio_split(src, total_input >> 9, ++ GFP_NOIO, &c->bio_write); ++ wbio_init(dst)->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ dst->bi_opf = src->bi_opf; ++ } ++ ++ dst->bi_iter.bi_size = total_output; ++do_write: ++ *_dst = dst; ++ return more; ++csum_err: ++ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); ++ ret = -EIO; ++err: ++ if (to_wbio(dst)->bounce) ++ bch2_bio_free_pages_pool(c, dst); ++ if (to_wbio(dst)->put_bio) ++ bio_put(dst); ++ ++ return ret; ++} ++ ++static void __bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ struct write_point 
*wp; ++ struct bio *bio = NULL; ++ bool skip_put = true; ++ unsigned nofs_flags; ++ int ret; ++ ++ nofs_flags = memalloc_nofs_save(); ++again: ++ memset(&op->failed, 0, sizeof(op->failed)); ++ ++ do { ++ struct bkey_i *key_to_write; ++ unsigned key_to_write_offset = op->insert_keys.top_p - ++ op->insert_keys.keys_p; ++ ++ /* +1 for possible cache device: */ ++ if (op->open_buckets.nr + op->nr_replicas + 1 > ++ ARRAY_SIZE(op->open_buckets.v)) ++ goto flush_io; ++ ++ if (bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)) ++ goto flush_io; ++ ++ /* ++ * The copygc thread is now global, which means it's no longer ++ * freeing up space on specific disks, which means that ++ * allocations for specific disks may hang arbitrarily long: ++ */ ++ wp = bch2_alloc_sectors_start(c, ++ op->target, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); ++ EBUG_ON(!wp); ++ ++ if (unlikely(IS_ERR(wp))) { ++ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { ++ ret = PTR_ERR(wp); ++ goto err; ++ } ++ ++ goto flush_io; ++ } ++ ++ /* ++ * It's possible for the allocator to fail, put us on the ++ * freelist waitlist, and then succeed in one of various retry ++ * paths: if that happens, we need to disable the skip_put ++ * optimization because otherwise there won't necessarily be a ++ * barrier before we free the bch_write_op: ++ */ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ skip_put = false; ++ ++ bch2_open_bucket_get(c, wp, &op->open_buckets); ++ ret = bch2_write_extent(op, wp, &bio); ++ bch2_alloc_sectors_done(c, wp); ++ ++ if (ret < 0) ++ goto err; ++ ++ if (ret) { ++ skip_put = false; ++ } else { ++ /* ++ * for the skip_put optimization this has to be set ++ * before we submit the bio: ++ */ ++ op->flags |= BCH_WRITE_DONE; ++ } ++ ++ bio->bi_end_io = bch2_write_endio; ++ bio->bi_private = &op->cl; ++ bio->bi_opf |= REQ_OP_WRITE; ++ ++ if (!skip_put) ++ closure_get(bio->bi_private); ++ else ++ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ ++ key_to_write = (void *) (op->insert_keys.keys_p + ++ key_to_write_offset); ++ ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, ++ key_to_write); ++ } while (ret); ++ ++ if (!skip_put) ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++out: ++ memalloc_nofs_restore(nofs_flags); ++ return; ++err: ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++flush_io: ++ /* ++ * If the write can't all be submitted at once, we generally want to ++ * block synchronously as that signals backpressure to the caller. 
++ * ++ * However, if we're running out of a workqueue, we can't block here ++ * because we'll be blocking other work items from completing: ++ */ ++ if (current->flags & PF_WQ_WORKER) { ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++ } ++ ++ closure_sync(cl); ++ ++ if (!bch2_keylist_empty(&op->insert_keys)) { ++ __bch2_write_index(op); ++ ++ if (op->error) { ++ op->flags |= BCH_WRITE_DONE; ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ goto out; ++ } ++ } ++ ++ goto again; ++} ++ ++static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->wbio.bio; ++ struct bvec_iter iter; ++ struct bkey_i_inline_data *id; ++ unsigned sectors; ++ int ret; ++ ++ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ++ ++ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); ++ if (ret) { ++ op->error = ret; ++ goto err; ++ } ++ ++ sectors = bio_sectors(bio); ++ op->pos.offset += sectors; ++ ++ id = bkey_inline_data_init(op->insert_keys.top); ++ id->k.p = op->pos; ++ id->k.version = op->version; ++ id->k.size = sectors; ++ ++ iter = bio->bi_iter; ++ iter.bi_size = data_len; ++ memcpy_from_bio(id->v.data, bio, iter); ++ ++ while (data_len & 7) ++ id->v.data[data_len++] = '\0'; ++ set_bkey_val_bytes(&id->k, data_len); ++ bch2_keylist_push(&op->insert_keys); ++ ++ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at_nobarrier(cl, bch2_write_index, NULL); ++ return; ++err: ++ bch2_write_done(&op->cl); ++} ++ ++/** ++ * bch_write - handle a write to a cache device or flash only volume ++ * ++ * This is the starting point for any data to end up in a cache device; it could ++ * be from a normal write, or a writeback write, or a write to a flash only ++ * volume - it's also used by the moving garbage collector to compact data in ++ * mostly empty buckets. ++ * ++ * It first writes the data to the cache, creating a list of keys to be inserted ++ * (if the data won't fit in a single open bucket, there will be multiple keys); ++ * after the data is written it calls bch_journal, and after the keys have been ++ * added to the next journal write they're inserted into the btree. ++ * ++ * If op->discard is true, instead of inserting the data it invalidates the ++ * region of the cache represented by op->bio and op->inode. 
++ */ ++void bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bio *bio = &op->wbio.bio; ++ struct bch_fs *c = op->c; ++ unsigned data_len; ++ ++ BUG_ON(!op->nr_replicas); ++ BUG_ON(!op->write_point.v); ++ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); ++ ++ op->start_time = local_clock(); ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ ++ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { ++ bch_err_inum_ratelimited(c, op->pos.inode, ++ "misaligned write"); ++ op->error = -EIO; ++ goto err; ++ } ++ ++ if (c->opts.nochanges || ++ !percpu_ref_tryget_live(&c->writes)) { ++ op->error = -EROFS; ++ goto err; ++ } ++ ++ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); ++ bch2_increment_clock(c, bio_sectors(bio), WRITE); ++ ++ data_len = min_t(u64, bio->bi_iter.bi_size, ++ op->new_i_size - (op->pos.offset << 9)); ++ ++ if (c->opts.inline_data && ++ data_len <= min(block_bytes(c) / 2, 1024U)) { ++ bch2_write_data_inline(op, data_len); ++ return; ++ } ++ ++ continue_at_nobarrier(cl, __bch2_write, NULL); ++ return; ++err: ++ bch2_disk_reservation_put(c, &op->res); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/* Cache promotion on read */ ++ ++struct promote_op { ++ struct closure cl; ++ struct rcu_head rcu; ++ u64 start_time; ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct data_update write; ++ struct bio_vec bi_inline_vecs[0]; /* must be last */ ++}; ++ ++static const struct rhashtable_params bch_promote_params = { ++ .head_offset = offsetof(struct promote_op, hash), ++ .key_offset = offsetof(struct promote_op, pos), ++ .key_len = sizeof(struct bpos), ++}; ++ ++static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, ++ struct bpos pos, ++ struct bch_io_opts opts, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_MAY_PROMOTE)) ++ return false; ++ ++ if (!opts.promote_target) ++ return false; ++ ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) ++ return false; ++ ++ if (bch2_target_congested(c, opts.promote_target)) { ++ /* XXX trace this */ ++ return false; ++ } ++ ++ if (rhashtable_lookup_fast(&c->promote_table, &pos, ++ bch_promote_params)) ++ return false; ++ ++ return true; ++} ++ ++static void promote_free(struct bch_fs *c, struct promote_op *op) ++{ ++ int ret; ++ ++ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); ++ percpu_ref_put(&c->writes); ++ kfree_rcu(op, rcu); ++} ++ ++static void promote_done(struct closure *cl) ++{ ++ struct promote_op *op = ++ container_of(cl, struct promote_op, cl); ++ struct bch_fs *c = op->write.op.c; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], ++ op->start_time); ++ ++ bch2_data_update_exit(&op->write); ++ promote_free(c, op); ++} ++ ++static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->write.op.wbio.bio; ++ ++ trace_promote(&rbio->bio); ++ ++ /* we now own pages: */ ++ BUG_ON(!rbio->bounce); ++ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++ ++ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, ++ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); ++ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ ++ closure_init(cl, NULL); ++ bch2_data_update_read_done(&op->write, rbio->pick.crc, cl); ++ closure_return_with_destructor(cl, promote_done); ++} ++ ++static struct 
promote_op *__promote_alloc(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned sectors, ++ struct bch_read_bio **rbio) ++{ ++ struct promote_op *op = NULL; ++ struct bio *bio; ++ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ int ret; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return NULL; ++ ++ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); ++ if (!op) ++ goto err; ++ ++ op->start_time = local_clock(); ++ op->pos = pos; ++ ++ /* ++ * We don't use the mempool here because extents that aren't ++ * checksummed or compressed can be too big for the mempool: ++ */ ++ *rbio = kzalloc(sizeof(struct bch_read_bio) + ++ sizeof(struct bio_vec) * pages, ++ GFP_NOIO); ++ if (!*rbio) ++ goto err; ++ ++ rbio_init(&(*rbio)->bio, opts); ++ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); ++ ++ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, ++ GFP_NOIO)) ++ goto err; ++ ++ (*rbio)->bounce = true; ++ (*rbio)->split = true; ++ (*rbio)->kmalloc = true; ++ ++ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, ++ bch_promote_params)) ++ goto err; ++ ++ bio = &op->write.op.wbio.bio; ++ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); ++ ++ ret = bch2_data_update_init(c, &op->write, ++ writepoint_hashed((unsigned long) current), ++ opts, ++ (struct data_update_opts) { ++ .target = opts.promote_target, ++ .extra_replicas = 1, ++ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, ++ }, ++ btree_id, k); ++ BUG_ON(ret); ++ ++ return op; ++err: ++ if (*rbio) ++ bio_free_pages(&(*rbio)->bio); ++ kfree(*rbio); ++ *rbio = NULL; ++ kfree(op); ++ percpu_ref_put(&c->writes); ++ return NULL; ++} ++ ++noinline ++static struct promote_op *promote_alloc(struct bch_fs *c, ++ struct bvec_iter iter, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned flags, ++ struct bch_read_bio **rbio, ++ bool *bounce, ++ bool *read_full) ++{ ++ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); ++ /* data might have to be decompressed in the write path: */ ++ unsigned sectors = promote_full ++ ? max(pick->crc.compressed_size, pick->crc.live_size) ++ : bvec_iter_sectors(iter); ++ struct bpos pos = promote_full ++ ? bkey_start_pos(k.k) ++ : POS(k.k->p.inode, iter.bi_sector); ++ struct promote_op *promote; ++ ++ if (!should_promote(c, k, pos, opts, flags)) ++ return NULL; ++ ++ promote = __promote_alloc(c, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_reflink ++ : BTREE_ID_extents, ++ k, pos, pick, opts, sectors, rbio); ++ if (!promote) ++ return NULL; ++ ++ *bounce = true; ++ *read_full = promote_full; ++ return promote; ++} ++ ++/* Read */ ++ ++#define READ_RETRY_AVOID 1 ++#define READ_RETRY 2 ++#define READ_ERR 3 ++ ++enum rbio_context { ++ RBIO_CONTEXT_NULL, ++ RBIO_CONTEXT_HIGHPRI, ++ RBIO_CONTEXT_UNBOUND, ++}; ++ ++static inline struct bch_read_bio * ++bch2_rbio_parent(struct bch_read_bio *rbio) ++{ ++ return rbio->split ? 
rbio->parent : rbio; ++} ++ ++__always_inline ++static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, ++ enum rbio_context context, ++ struct workqueue_struct *wq) ++{ ++ if (context <= rbio->context) { ++ fn(&rbio->work); ++ } else { ++ rbio->work.func = fn; ++ rbio->context = context; ++ queue_work(wq, &rbio->work); ++ } ++} ++ ++static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) ++{ ++ BUG_ON(rbio->bounce && !rbio->split); ++ ++ if (rbio->promote) ++ promote_free(rbio->c, rbio->promote); ++ rbio->promote = NULL; ++ ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ ++ if (rbio->split) { ++ struct bch_read_bio *parent = rbio->parent; ++ ++ if (rbio->kmalloc) ++ kfree(rbio); ++ else ++ bio_put(&rbio->bio); ++ ++ rbio = parent; ++ } ++ ++ return rbio; ++} ++ ++/* ++ * Only called on a top level bch_read_bio to complete an entire read request, ++ * not a split: ++ */ ++static void bch2_rbio_done(struct bch_read_bio *rbio) ++{ ++ if (rbio->start_time) ++ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], ++ rbio->start_time); ++ bio_endio(&rbio->bio); ++} ++ ++static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, ++ struct bch_io_failures *failed, ++ unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter, rbio->data_btree, ++ rbio->read_pos, BTREE_ITER_SLOTS); ++retry: ++ rbio->bio.bi_status = 0; ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ if (bkey_err(k)) ++ goto err; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ if (!bch2_bkey_matches_ptr(c, k, ++ rbio->pick.ptr, ++ rbio->data_pos.offset - ++ rbio->pick.crc.offset)) { ++ /* extent we wanted to read no longer exists: */ ++ rbio->hole = true; ++ goto out; ++ } ++ ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, ++ rbio->read_pos, ++ rbio->data_btree, ++ k, 0, failed, flags); ++ if (ret == READ_RETRY) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_rbio_done(rbio); ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ return; ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ goto out; ++} ++ ++static void bch2_rbio_retry(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bvec_iter iter = rbio->bvec_iter; ++ unsigned flags = rbio->flags; ++ subvol_inum inum = { ++ .subvol = rbio->subvol, ++ .inum = rbio->read_pos.inode, ++ }; ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ trace_read_retry(&rbio->bio); ++ ++ if (rbio->retry == READ_RETRY_AVOID) ++ bch2_mark_io_failure(&failed, &rbio->pick); ++ ++ rbio->bio.bi_status = 0; ++ ++ rbio = bch2_rbio_free(rbio); ++ ++ flags |= BCH_READ_IN_RETRY; ++ flags &= ~BCH_READ_MAY_PROMOTE; ++ ++ if (flags & BCH_READ_NODECODE) { ++ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); ++ } else { ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ __bch2_read(c, rbio, iter, inum, &failed, flags); ++ } ++} ++ ++static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, ++ blk_status_t error) ++{ ++ rbio->retry = retry; ++ ++ if (rbio->flags & BCH_READ_IN_RETRY) ++ 
return; ++ ++ if (retry == READ_ERR) { ++ rbio = bch2_rbio_free(rbio); ++ ++ rbio->bio.bi_status = error; ++ bch2_rbio_done(rbio); ++ } else { ++ bch2_rbio_punt(rbio, bch2_rbio_retry, ++ RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ } ++} ++ ++static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, ++ struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; ++ struct bch_extent_crc_unpacked new_crc; ++ struct btree_iter iter; ++ struct bkey_i *new; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (crc_is_compressed(rbio->pick.crc)) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ if ((ret = bkey_err(k))) ++ goto out; ++ ++ if (bversion_cmp(k.k->version, rbio->version) || ++ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) ++ goto out; ++ ++ /* Extent was merged? */ ++ if (bkey_start_offset(k.k) < data_offset || ++ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) ++ goto out; ++ ++ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, ++ rbio->pick.crc, NULL, &new_crc, ++ bkey_start_offset(k.k) - data_offset, k.k->size, ++ rbio->pick.crc.csum_type)) { ++ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ ret = 0; ++ goto out; ++ } ++ ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ sizeof(struct bch_extent_crc128)); ++ if ((ret = PTR_ERR_OR_ZERO(new))) ++ goto out; ++ ++ bkey_reassemble(new, k); ++ ++ if (!bch2_bkey_narrow_crcs(new, new_crc)) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, new, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ __bch2_rbio_narrow_crcs(&trans, rbio)); ++} ++ ++/* Inner part that may run in process context */ ++static void __bch2_read_endio(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct bio *src = &rbio->bio; ++ struct bio *dst = &bch2_rbio_parent(rbio)->bio; ++ struct bvec_iter dst_iter = rbio->bvec_iter; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ unsigned nofs_flags; ++ struct bch_csum csum; ++ int ret; ++ ++ nofs_flags = memalloc_nofs_save(); ++ ++ /* Reset iterator for checksumming and copying bounced data: */ ++ if (rbio->bounce) { ++ src->bi_iter.bi_size = crc.compressed_size << 9; ++ src->bi_iter.bi_idx = 0; ++ src->bi_iter.bi_bvec_done = 0; ++ } else { ++ src->bi_iter = rbio->bvec_iter; ++ } ++ ++ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) ++ goto csum_err; ++ ++ /* ++ * XXX ++ * We need to rework the narrow_crcs path to deliver the read completion ++ * first, and then punt to a different workqueue, otherwise we're ++ * holding up reads while doing btree updates which is bad for memory ++ * reclaim. 
++ */ ++ if (unlikely(rbio->narrow_crcs)) ++ bch2_rbio_narrow_crcs(rbio); ++ ++ if (rbio->flags & BCH_READ_NODECODE) ++ goto nodecode; ++ ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ ++ if (crc_is_compressed(crc)) { ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); ++ ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; ++ ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ ++ if (rbio->promote) { ++ /* ++ * Re encrypt data we decrypted, so it's consistent with ++ * rbio->crc: ++ */ ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ ++ promote_start(rbio->promote, rbio); ++ rbio->promote = NULL; ++ } ++nodecode: ++ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ rbio = bch2_rbio_free(rbio); ++ bch2_rbio_done(rbio); ++ } ++out: ++ memalloc_nofs_restore(nofs_flags); ++ return; ++csum_err: ++ /* ++ * Checksum error: if the bio wasn't bounced, we may have been ++ * reading into buffers owned by userspace (that userspace can ++ * scribble over) - retry the read, bouncing it this time: ++ */ ++ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { ++ rbio->flags |= BCH_READ_MUST_BOUNCE; ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, ++ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", ++ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, ++ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++decompression_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decompression error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; ++decrypt_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decrypt error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; ++} ++ ++static void bch2_read_endio(struct bio *bio) ++{ ++ struct bch_read_bio *rbio = ++ container_of(bio, struct bch_read_bio, bio); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct workqueue_struct *wq = NULL; ++ enum rbio_context context = RBIO_CONTEXT_NULL; ++ ++ if (rbio->have_ioref) { ++ bch2_latency_acct(ca, rbio->submit_time, READ); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (!rbio->split) ++ rbio->bio.bi_end_io = rbio->end_io; ++ ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ rbio->read_pos.inode, ++ rbio->read_pos.offset, ++ "data read error: %s", ++ bch2_blk_status_to_str(bio->bi_status))) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ return; ++ } ++ ++ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr)) { ++ atomic_long_inc(&c->read_realloc_races); ++ ++ if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); ++ else ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); 
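/*
 * Simplified, standalone model of the "punt only when necessary" pattern
 * used by bch2_rbio_punt() and by the context/workqueue choice made just
 * below in bch2_read_endio(): completion work runs inline when the current
 * context is already heavy enough for it, and is otherwise handed off to a
 * more capable executor.  The types and run_deferred() are hypothetical
 * stand-ins, not bcachefs or kernel APIs.
 */
enum sketch_ctx { SKETCH_CTX_NONE, SKETCH_CTX_HIGHPRI, SKETCH_CTX_UNBOUND };

struct sketch_work {
	void		(*fn)(struct sketch_work *);
	enum sketch_ctx	needs;	/* weakest context the handler tolerates */
};

/* stand-in for queue_work(); a real version would defer to another thread */
static void run_deferred(struct sketch_work *w)
{
	w->fn(w);
}

static void sketch_punt(struct sketch_work *w, enum sketch_ctx have)
{
	if (w->needs <= have)
		w->fn(w);		/* cheap case: finish right here */
	else
		run_deferred(w);	/* heavy case: decrypt/decompress etc. */
}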
++ return; ++ } ++ ++ if (rbio->narrow_crcs || ++ crc_is_compressed(rbio->pick.crc) || ++ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) ++ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; ++ else if (rbio->pick.crc.csum_type) ++ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; ++ ++ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); ++} ++ ++int __bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_buf *orig_k) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 reflink_offset; ++ int ret; ++ ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + ++ *offset_into_extent; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_reflink_v && ++ k.k->type != KEY_TYPE_indirect_inline_data) { ++ bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, ++ "%llu len %u points to nonexistent indirect extent %llu", ++ orig_k->k->k.p.offset, ++ orig_k->k->k.size, ++ reflink_offset); ++ bch2_inconsistent_error(trans->c); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); ++ bch2_bkey_buf_reassemble(orig_k, trans->c, k); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_ptr ptr) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); ++ struct btree_iter iter; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ PTR_BUCKET_POS(c, &ptr), ++ BTREE_ITER_CACHED); ++ ++ prt_printf(&buf, "Attempting to read from stale dirty pointer:"); ++ printbuf_indent_add(&buf, 2); ++ prt_newline(&buf); ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (!ret) { ++ prt_newline(&buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ } ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++} ++ ++int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, ++ struct bvec_iter iter, struct bpos read_pos, ++ enum btree_id data_btree, struct bkey_s_c k, ++ unsigned offset_into_extent, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct extent_ptr_decoded pick; ++ struct bch_read_bio *rbio = NULL; ++ struct bch_dev *ca = NULL; ++ struct promote_op *promote = NULL; ++ bool bounce = false, read_full = false, narrow_crcs = false; ++ struct bpos data_pos = bkey_start_pos(k.k); ++ int pick_ret; ++ ++ if (bkey_extent_is_inline_data(k.k)) { ++ unsigned bytes = min_t(unsigned, iter.bi_size, ++ bkey_inline_data_bytes(k.k)); ++ ++ swap(iter.bi_size, bytes); ++ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); ++ swap(iter.bi_size, bytes); ++ bio_advance_iter(&orig->bio, &iter, bytes); ++ zero_fill_bio_iter(&orig->bio, iter); ++ goto out_read_done; ++ } ++retry_pick: ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ ++ /* hole or reservation - just zero fill: */ ++ if (!pick_ret) ++ goto hole; ++ ++ if (pick_ret < 0) { ++ bch_err_inum_ratelimited(c, k.k->p.inode, ++ "no device to read 
from"); ++ goto err; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ /* ++ * Stale dirty pointers are treated as IO errors, but @failed isn't ++ * allocated unless we're in the retry path - so if we're not in the ++ * retry path, don't check here, it'll be caught in bch2_read_endio() ++ * and we'll end up in the retry path: ++ */ ++ if ((flags & BCH_READ_IN_RETRY) && ++ !pick.ptr.cached && ++ unlikely(ptr_stale(ca, &pick.ptr))) { ++ read_from_stale_dirty_pointer(trans, k, pick.ptr); ++ bch2_mark_io_failure(failed, &pick); ++ goto retry_pick; ++ } ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(trans); ++ ++ if (flags & BCH_READ_NODECODE) { ++ /* ++ * can happen if we retry, and the extent we were going to read ++ * has been merged in the meantime: ++ */ ++ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) ++ goto hole; ++ ++ iter.bi_size = pick.crc.compressed_size << 9; ++ goto get_bio; ++ } ++ ++ if (!(flags & BCH_READ_LAST_FRAGMENT) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_MUST_CLONE; ++ ++ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) ++ flags |= BCH_READ_MUST_BOUNCE; ++ ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_none && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_USER_MAPPED)) || ++ (flags & BCH_READ_MUST_BOUNCE)))) { ++ read_full = true; ++ bounce = true; ++ } ++ ++ if (orig->opts.promote_target) ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); ++ ++ if (!read_full) { ++ EBUG_ON(crc_is_compressed(pick.crc)); ++ EBUG_ON(pick.crc.csum_type && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ bvec_iter_sectors(iter) != pick.crc.live_size || ++ pick.crc.offset || ++ offset_into_extent)); ++ ++ data_pos.offset += offset_into_extent; ++ pick.ptr.offset += pick.crc.offset + ++ offset_into_extent; ++ offset_into_extent = 0; ++ pick.crc.compressed_size = bvec_iter_sectors(iter); ++ pick.crc.uncompressed_size = bvec_iter_sectors(iter); ++ pick.crc.offset = 0; ++ pick.crc.live_size = bvec_iter_sectors(iter); ++ offset_into_extent = 0; ++ } ++get_bio: ++ if (rbio) { ++ /* ++ * promote already allocated bounce rbio: ++ * promote needs to allocate a bio big enough for uncompressing ++ * data in the write path, but we're not going to use it all ++ * here: ++ */ ++ EBUG_ON(rbio->bio.bi_iter.bi_size < ++ pick.crc.compressed_size << 9); ++ rbio->bio.bi_iter.bi_size = ++ pick.crc.compressed_size << 9; ++ } else if (bounce) { ++ unsigned sectors = pick.crc.compressed_size; ++ ++ rbio = rbio_init(bio_alloc_bioset(NULL, ++ DIV_ROUND_UP(sectors, PAGE_SECTORS), ++ 0, ++ GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ ++ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); ++ rbio->bounce = true; ++ rbio->split = true; ++ } else if (flags & BCH_READ_MUST_CLONE) { ++ /* ++ * Have to clone if there were any splits, due to error ++ * reporting issues (if a split errored, and retrying didn't ++ * work, when it reports the error to its parent (us) we don't ++ * know if the error was from our bio, and we should retry, or ++ * from the whole bio, in which case we don't want to retry and ++ * lose the error) ++ */ ++ rbio 
= rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ rbio->bio.bi_iter = iter; ++ rbio->split = true; ++ } else { ++ rbio = orig; ++ rbio->bio.bi_iter = iter; ++ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ } ++ ++ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ ++ rbio->c = c; ++ rbio->submit_time = local_clock(); ++ if (rbio->split) ++ rbio->parent = orig; ++ else ++ rbio->end_io = orig->bio.bi_end_io; ++ rbio->bvec_iter = iter; ++ rbio->offset_into_extent= offset_into_extent; ++ rbio->flags = flags; ++ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); ++ rbio->narrow_crcs = narrow_crcs; ++ rbio->hole = 0; ++ rbio->retry = 0; ++ rbio->context = 0; ++ /* XXX: only initialize this if needed */ ++ rbio->devs_have = bch2_bkey_devs(k); ++ rbio->pick = pick; ++ rbio->subvol = orig->subvol; ++ rbio->read_pos = read_pos; ++ rbio->data_btree = data_btree; ++ rbio->data_pos = data_pos; ++ rbio->version = k.k->version; ++ rbio->promote = promote; ++ INIT_WORK(&rbio->work, NULL); ++ ++ rbio->bio.bi_opf = orig->bio.bi_opf; ++ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rbio->bio.bi_end_io = bch2_read_endio; ++ ++ if (rbio->bounce) ++ trace_read_bounce(&rbio->bio); ++ ++ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); ++ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); ++ ++ /* ++ * If it's being moved internally, we don't want to flag it as a cache ++ * hit: ++ */ ++ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) ++ bch2_bucket_io_time_reset(trans, pick.ptr.dev, ++ PTR_BUCKET_NR(ca, &pick.ptr), READ); ++ ++ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ bio_inc_remaining(&orig->bio); ++ trace_read_split(&orig->bio); ++ } ++ ++ if (!rbio->pick.idx) { ++ if (!rbio->have_ioref) { ++ bch_err_inum_ratelimited(c, k.k->p.inode, ++ "no device to read from"); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], ++ bio_sectors(&rbio->bio)); ++ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ submit_bio(&rbio->bio); ++ else ++ submit_bio_wait(&rbio->bio); ++ } else { ++ /* Attempting reconstruct read: */ ++ if (bch2_ec_read_extent(c, rbio)) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ bio_endio(&rbio->bio); ++ } ++out: ++ if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ return 0; ++ } else { ++ int ret; ++ ++ rbio->context = RBIO_CONTEXT_UNBOUND; ++ bch2_read_endio(&rbio->bio); ++ ++ ret = rbio->retry; ++ rbio = bch2_rbio_free(rbio); ++ ++ if (ret == READ_RETRY_AVOID) { ++ bch2_mark_io_failure(failed, &pick); ++ ret = READ_RETRY; ++ } ++ ++ if (!ret) ++ goto out_read_done; ++ ++ return ret; ++ } ++ ++err: ++ if (flags & BCH_READ_IN_RETRY) ++ return READ_ERR; ++ ++ orig->bio.bi_status = BLK_STS_IOERR; ++ goto out_read_done; ++ ++hole: ++ /* ++ * won't normally happen in the BCH_READ_NODECODE ++ * (bch2_move_extent()) path, but if we retry and the extent we wanted ++ * to read no longer exists we have to signal that: ++ */ ++ if (flags & BCH_READ_NODECODE) ++ orig->hole = true; ++ ++ zero_fill_bio_iter(&orig->bio, iter); ++out_read_done: ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ bch2_rbio_done(orig); ++ return 0; ++} ++ ++void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, subvol_inum inum, ++ struct bch_io_failures 
*failed, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ BUG_ON(flags & BCH_READ_NODECODE); ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(inum.inum, bvec_iter.bi_sector, snapshot), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ unsigned bytes, sectors, offset_into_extent; ++ enum btree_id data_btree = BTREE_ID_extents; ++ ++ /* ++ * read_extent -> io_time_reset may cause a transaction restart ++ * without returning an error, we need to check for that here: ++ */ ++ ret = bch2_trans_relock(&trans); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, ++ POS(inum.inum, bvec_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ offset_into_extent = iter.pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ ++ ret = bch2_read_indirect_extent(&trans, &data_btree, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ k = bkey_i_to_s_c(sk.k); ++ ++ /* ++ * With indirect extents, the amount of data to read is the min ++ * of the original extent and the indirect extent: ++ */ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); ++ ++ if (bvec_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, ++ data_btree, k, ++ offset_into_extent, failed, flags); ++ if (ret) ++ break; ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ ++ ret = btree_trans_too_many_iters(&trans); ++ if (ret) ++ break; ++ } ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == READ_RETRY || ++ ret == READ_RETRY_AVOID) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ ++ if (ret) { ++ bch_err_inum_ratelimited(c, inum.inum, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ bch2_rbio_done(rbio); ++ } ++} ++ ++void bch2_fs_io_exit(struct bch_fs *c) ++{ ++ if (c->promote_table.tbl) ++ rhashtable_destroy(&c->promote_table); ++ mempool_exit(&c->bio_bounce_pages); ++ bioset_exit(&c->bio_write); ++ bioset_exit(&c->bio_read_split); ++ bioset_exit(&c->bio_read); ++} ++ ++int bch2_fs_io_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), ++ BIOSET_NEED_BVECS) || ++ mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->opts.encoded_extent_max) / ++ PAGE_SIZE, 0) || ++ rhashtable_init(&c->promote_table, &bch_promote_params)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +new file mode 100644 +index 000000000000..fb5114518666 +--- /dev/null ++++ b/fs/bcachefs/io.h +@@ -0,0 +1,189 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ 
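/*
 * Standalone illustration of the flag-selected union used by
 * op_journal_seq() further down in this header: depending on
 * BCH_WRITE_JOURNAL_SEQ_PTR, the same storage is either a pointer to a
 * caller-provided journal sequence number or inline storage for one, so
 * the common case needs no extra allocation.  The names below are
 * hypothetical stand-ins, not bcachefs code.
 */
#include <stdbool.h>
#include <stdint.h>

struct seq_holder {
	bool		seq_is_ptr;
	union {
		uint64_t	*seq_p;	/* caller wants the value written here */
		uint64_t	seq;	/* otherwise stash it inline */
	};
};

static inline uint64_t *seq_holder_seq(struct seq_holder *h)
{
	return h->seq_is_ptr ? h->seq_p : &h->seq;
}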
++#ifndef _BCACHEFS_IO_H ++#define _BCACHEFS_IO_H ++ ++#include "checksum.h" ++#include "bkey_buf.h" ++#include "io_types.h" ++ ++#define to_wbio(_bio) \ ++ container_of((_bio), struct bch_write_bio, bio) ++ ++#define to_rbio(_bio) \ ++ container_of((_bio), struct bch_read_bio, bio) ++ ++void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); ++void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); ++ ++void bch2_latency_acct(struct bch_dev *, u64, int); ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, ++ enum bch_data_type, const struct bkey_i *); ++ ++#define BLK_STS_REMOVED ((__force blk_status_t)128) ++ ++const char *bch2_blk_status_to_str(blk_status_t); ++ ++enum bch_write_flags { ++ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), ++ BCH_WRITE_CACHED = (1 << 1), ++ BCH_WRITE_FLUSH = (1 << 2), ++ BCH_WRITE_DATA_ENCODED = (1 << 3), ++ BCH_WRITE_PAGES_STABLE = (1 << 4), ++ BCH_WRITE_PAGES_OWNED = (1 << 5), ++ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), ++ BCH_WRITE_FROM_INTERNAL = (1 << 8), ++ BCH_WRITE_CHECK_ENOSPC = (1 << 9), ++ ++ /* Internal: */ ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), ++ BCH_WRITE_DONE = (1 << 12), ++}; ++ ++static inline u64 *op_journal_seq(struct bch_write_op *op) ++{ ++ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) ++ ? op->journal_seq_p : &op->journal_seq; ++} ++ ++static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) ++{ ++ return op->alloc_reserve == RESERVE_movinggc ++ ? op->c->copygc_wq ++ : op->c->btree_update_wq; ++} ++ ++int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, bool *, s64 *, s64 *); ++int bch2_extent_update(struct btree_trans *, subvol_inum, ++ struct btree_iter *, struct bkey_i *, ++ struct disk_reservation *, u64 *, u64, s64 *, bool); ++ ++int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, ++ subvol_inum, u64, s64 *); ++int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); ++ ++int bch2_write_index_default(struct bch_write_op *); ++ ++static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, ++ struct bch_io_opts opts) ++{ ++ op->c = c; ++ op->end_io = NULL; ++ op->flags = 0; ++ op->written = 0; ++ op->error = 0; ++ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); ++ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->nr_replicas = 0; ++ op->nr_replicas_required = c->opts.data_replicas_required; ++ op->alloc_reserve = RESERVE_none; ++ op->incompressible = 0; ++ op->open_buckets.nr = 0; ++ op->devs_have.nr = 0; ++ op->target = 0; ++ op->opts = opts; ++ op->subvol = 0; ++ op->pos = POS_MAX; ++ op->version = ZERO_VERSION; ++ op->write_point = (struct write_point_specifier) { 0 }; ++ op->res = (struct disk_reservation) { 0 }; ++ op->journal_seq = 0; ++ op->new_i_size = U64_MAX; ++ op->i_sectors_delta = 0; ++ op->index_update_fn = bch2_write_index_default; ++} ++ ++void bch2_write(struct closure *); ++ ++static inline struct bch_write_bio *wbio_init(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ ++ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); ++ return wbio; ++} ++ ++struct bch_devs_mask; ++struct cache_promote_op; ++struct extent_ptr_decoded; ++ ++int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, ++ struct bkey_buf *); ++ ++static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ enum btree_id *data_btree, ++ 
unsigned *offset_into_extent, ++ struct bkey_buf *k) ++{ ++ if (k->k->k.type != KEY_TYPE_reflink_p) ++ return 0; ++ ++ *data_btree = BTREE_ID_reflink; ++ return __bch2_read_indirect_extent(trans, offset_into_extent, k); ++} ++ ++enum bch_read_flags { ++ BCH_READ_RETRY_IF_STALE = 1 << 0, ++ BCH_READ_MAY_PROMOTE = 1 << 1, ++ BCH_READ_USER_MAPPED = 1 << 2, ++ BCH_READ_NODECODE = 1 << 3, ++ BCH_READ_LAST_FRAGMENT = 1 << 4, ++ ++ /* internal: */ ++ BCH_READ_MUST_BOUNCE = 1 << 5, ++ BCH_READ_MUST_CLONE = 1 << 6, ++ BCH_READ_IN_RETRY = 1 << 7, ++}; ++ ++int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, ++ struct bvec_iter, struct bpos, enum btree_id, ++ struct bkey_s_c, unsigned, ++ struct bch_io_failures *, unsigned); ++ ++static inline void bch2_read_extent(struct btree_trans *trans, ++ struct bch_read_bio *rbio, struct bpos read_pos, ++ enum btree_id data_btree, struct bkey_s_c k, ++ unsigned offset_into_extent, unsigned flags) ++{ ++ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, ++ data_btree, k, offset_into_extent, NULL, flags); ++} ++ ++void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, ++ subvol_inum, struct bch_io_failures *, unsigned flags); ++ ++static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ++ subvol_inum inum) ++{ ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ BUG_ON(rbio->_state); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ rbio->subvol = inum.subvol; ++ ++ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, ++ BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED); ++} ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_io_opts opts) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->_state = 0; ++ rbio->promote = NULL; ++ rbio->opts = opts; ++ return rbio; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *); ++int bch2_fs_io_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_IO_H */ +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +new file mode 100644 +index 000000000000..78bff13d36f2 +--- /dev/null ++++ b/fs/bcachefs/io_types.h +@@ -0,0 +1,161 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_TYPES_H ++#define _BCACHEFS_IO_TYPES_H ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "extents_types.h" ++#include "keylist_types.h" ++#include "opts.h" ++#include "super_types.h" ++ ++#include ++#include ++ ++struct bch_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ u64 submit_time; ++ ++ /* ++ * Reads will often have to be split, and if the extent being read from ++ * was checksummed or compressed we'll also have to allocate bounce ++ * buffers and copy the data back into the original bio. 
++ * ++ * If we didn't have to split, we have to save and restore the original ++ * bi_end_io - @split below indicates which: ++ */ ++ union { ++ struct bch_read_bio *parent; ++ bio_end_io_t *end_io; ++ }; ++ ++ /* ++ * Saved copy of bio->bi_iter, from submission time - allows us to ++ * resubmit on IO error, and also to copy data back to the original bio ++ * when we're bouncing: ++ */ ++ struct bvec_iter bvec_iter; ++ ++ unsigned offset_into_extent; ++ ++ u16 flags; ++ union { ++ struct { ++ u16 bounce:1, ++ split:1, ++ kmalloc:1, ++ have_ioref:1, ++ narrow_crcs:1, ++ hole:1, ++ retry:2, ++ context:2; ++ }; ++ u16 _state; ++ }; ++ ++ struct bch_devs_list devs_have; ++ ++ struct extent_ptr_decoded pick; ++ ++ /* ++ * pos we read from - different from data_pos for indirect extents: ++ */ ++ u32 subvol; ++ struct bpos read_pos; ++ ++ /* ++ * start pos of data we read (may not be pos of data we want) - for ++ * promote, narrow extents paths: ++ */ ++ enum btree_id data_btree; ++ struct bpos data_pos; ++ struct bversion version; ++ ++ struct promote_op *promote; ++ ++ struct bch_io_opts opts; ++ ++ struct work_struct work; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_bio { ++ struct bch_fs *c; ++ struct bch_write_bio *parent; ++ ++ u64 submit_time; ++ ++ struct bch_devs_list failed; ++ u8 dev; ++ ++ unsigned split:1, ++ bounce:1, ++ put_bio:1, ++ have_ioref:1, ++ used_mempool:1, ++ first_btree_write:1; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_op { ++ struct closure cl; ++ struct bch_fs *c; ++ void (*end_io)(struct bch_write_op *); ++ u64 start_time; ++ ++ unsigned written; /* sectors */ ++ u16 flags; ++ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ ++ ++ unsigned csum_type:4; ++ unsigned compression_type:4; ++ unsigned nr_replicas:4; ++ unsigned nr_replicas_required:4; ++ unsigned alloc_reserve:3; ++ unsigned incompressible:1; ++ ++ struct bch_devs_list devs_have; ++ u16 target; ++ u16 nonce; ++ struct bch_io_opts opts; ++ ++ u32 subvol; ++ struct bpos pos; ++ struct bversion version; ++ ++ /* For BCH_WRITE_DATA_ENCODED: */ ++ struct bch_extent_crc_unpacked crc; ++ ++ struct write_point_specifier write_point; ++ ++ struct disk_reservation res; ++ ++ struct open_buckets open_buckets; ++ ++ /* ++ * If caller wants to flush but hasn't passed us a journal_seq ptr, we ++ * still need to stash the journal_seq somewhere: ++ */ ++ union { ++ u64 *journal_seq_p; ++ u64 journal_seq; ++ }; ++ u64 new_i_size; ++ s64 i_sectors_delta; ++ ++ int (*index_update_fn)(struct bch_write_op *); ++ ++ struct bch_devs_mask failed; ++ ++ struct keylist insert_keys; ++ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; ++ ++ /* Must be last: */ ++ struct bch_write_bio wbio; ++}; ++ ++#endif /* _BCACHEFS_IO_TYPES_H */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +new file mode 100644 +index 000000000000..937ed1395e46 +--- /dev/null ++++ b/fs/bcachefs/journal.c +@@ -0,0 +1,1429 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs journalling code, for btree insertions ++ * ++ * Copyright 2012 Google, Inc. 
++ */
++
++#include "bcachefs.h"
++#include "alloc_foreground.h"
++#include "bkey_methods.h"
++#include "btree_gc.h"
++#include "btree_update.h"
++#include "buckets.h"
++#include "error.h"
++#include "journal.h"
++#include "journal_io.h"
++#include "journal_reclaim.h"
++#include "journal_sb.h"
++#include "journal_seq_blacklist.h"
++
++#include <trace/events/bcachefs.h>
++
++#define x(n) #n,
++static const char * const bch2_journal_watermarks[] = {
++ JOURNAL_WATERMARKS()
++ NULL
++};
++
++static const char * const bch2_journal_errors[] = {
++ JOURNAL_ERRORS()
++ NULL
++};
++#undef x
++
++static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
++{
++ return seq > j->seq_ondisk;
++}
++
++static bool __journal_entry_is_open(union journal_res_state state)
++{
++ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
++}
++
++static inline unsigned nr_unwritten_journal_entries(struct journal *j)
++{
++ return atomic64_read(&j->seq) - j->seq_ondisk;
++}
++
++static bool journal_entry_is_open(struct journal *j)
++{
++ return __journal_entry_is_open(j->reservations);
++}
++
++static inline struct journal_buf *
++journal_seq_to_buf(struct journal *j, u64 seq)
++{
++ struct journal_buf *buf = NULL;
++
++ EBUG_ON(seq > journal_cur_seq(j));
++
++ if (journal_seq_unwritten(j, seq)) {
++ buf = j->buf + (seq & JOURNAL_BUF_MASK);
++ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
++ }
++ return buf;
++}
++
++static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
++{
++ INIT_LIST_HEAD(&p->list);
++ INIT_LIST_HEAD(&p->key_cache_list);
++ INIT_LIST_HEAD(&p->flushed);
++ atomic_set(&p->count, count);
++ p->devs.nr = 0;
++}
++
++/* journal entry close/open: */
++
++void __bch2_journal_buf_put(struct journal *j)
++{
++ struct bch_fs *c = container_of(j, struct bch_fs, journal);
++
++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
++}
++
++/*
++ * Returns true if journal entry is now closed:
++ *
++ * We don't close a journal_buf until the next journal_buf is finished writing,
++ * and can be opened again - this also initializes the next journal_buf:
++ */
++static void __journal_entry_close(struct journal *j, unsigned closed_val)
++{
++ struct bch_fs *c = container_of(j, struct bch_fs, journal);
++ struct journal_buf *buf = journal_cur_buf(j);
++ union journal_res_state old, new;
++ u64 v = atomic64_read(&j->reservations.counter);
++ unsigned sectors;
++
++ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
++ closed_val != JOURNAL_ENTRY_ERROR_VAL);
++
++ lockdep_assert_held(&j->lock);
++
++ do {
++ old.v = new.v = v;
++ new.cur_entry_offset = closed_val;
++
++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
++ old.cur_entry_offset == new.cur_entry_offset)
++ return;
++ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
++ old.v, new.v)) != old.v);
++
++ if (!__journal_entry_is_open(old))
++ return;
++
++ /* Close out old buffer: */
++ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
++
++ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
++ buf->u64s_reserved) << c->block_bits;
++ BUG_ON(sectors > buf->sectors);
++ buf->sectors = sectors;
++
++ /*
++ * We have to set last_seq here, _before_ opening a new journal entry:
++ *
++ * A thread may replace an old pin with a new pin on its current
++ * journal reservation - the expectation being that the journal will
++ * contain either what the old pin protected or what the new pin
++ * protects.
++ *
++ * After the old pin is dropped journal_last_seq() won't include the old
++ * pin, so we can only write the updated last_seq on the entry that
++ * contains whatever the new pin protects.
++ *
++ * Restated, we can _not_ update last_seq for a given entry if there
++ * could be a newer entry open with reservations/pins that have been
++ * taken against it.
++ *
++ * Hence, we want to update/set last_seq on the current journal entry right
++ * before we open a new one:
++ */
++ buf->last_seq = journal_last_seq(j);
++ buf->data->last_seq = cpu_to_le64(buf->last_seq);
++ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
++
++ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
++
++ cancel_delayed_work(&j->write_work);
++
++ bch2_journal_space_available(j);
++
++ bch2_journal_buf_put(j, old.idx);
++}
++
++void bch2_journal_halt(struct journal *j)
++{
++ spin_lock(&j->lock);
++ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
++ if (!j->err_seq)
++ j->err_seq = journal_cur_seq(j);
++ spin_unlock(&j->lock);
++}
++
++static bool journal_entry_want_write(struct journal *j)
++{
++ bool ret = !journal_entry_is_open(j) ||
++ journal_cur_seq(j) == journal_last_unwritten_seq(j);
++
++ /* Don't close it yet if we already have a write in flight: */
++ if (ret)
++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
++ else if (nr_unwritten_journal_entries(j)) {
++ struct journal_buf *buf = journal_cur_buf(j);
++
++ if (!buf->flush_time) {
++ buf->flush_time = local_clock() ?: 1;
++ buf->expires = jiffies;
++ }
++ }
++
++ return ret;
++}
++
++static bool journal_entry_close(struct journal *j)
++{
++ bool ret;
++
++ spin_lock(&j->lock);
++ ret = journal_entry_want_write(j);
++ spin_unlock(&j->lock);
++
++ return ret;
++}
++
++/*
++ * should _only_ be called from journal_res_get() - when we actually want a
++ * journal reservation - journal entry is open means journal is dirty:
++ *
++ * returns:
++ * 0: success
++ * -ENOSPC: journal currently full, must invoke reclaim
++ * -EAGAIN: journal blocked, must wait
++ * -EROFS: insufficient rw devices or journal error
++ */
++static int journal_entry_open(struct journal *j)
++{
++ struct bch_fs *c = container_of(j, struct bch_fs, journal);
++ struct journal_buf *buf = j->buf +
++ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
++ union journal_res_state old, new;
++ int u64s;
++ u64 v;
++
++ lockdep_assert_held(&j->lock);
++ BUG_ON(journal_entry_is_open(j));
++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
++
++ if (j->blocked)
++ return JOURNAL_ERR_blocked;
++
++ if (j->cur_entry_error)
++ return j->cur_entry_error;
++
++ if (bch2_journal_error(j))
++ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
++
++ if (!fifo_free(&j->pin))
++ return JOURNAL_ERR_journal_pin_full;
++
++ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
++ return JOURNAL_ERR_max_in_flight;
++
++ BUG_ON(!j->cur_entry_sectors);
++
++ buf->expires =
++ (journal_cur_seq(j) == j->flushed_seq_ondisk
++ ?
jiffies ++ : j->last_flush_write) + ++ msecs_to_jiffies(c->opts.journal_flush_delay); ++ ++ buf->u64s_reserved = j->entry_u64s_reserved; ++ buf->disk_sectors = j->cur_entry_sectors; ++ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); ++ ++ u64s = (int) (buf->sectors << 9) / sizeof(u64) - ++ journal_entry_overhead(j); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ ++ if (u64s <= 0) ++ return JOURNAL_ERR_journal_full; ++ ++ if (fifo_empty(&j->pin) && j->reclaim_thread) ++ wake_up_process(j->reclaim_thread); ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ journal_pin_list_init(fifo_push_ref(&j->pin), 1); ++ ++ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); ++ ++ bkey_extent_init(&buf->key); ++ buf->noflush = false; ++ buf->must_flush = false; ++ buf->separate_flush = false; ++ buf->flush_time = 0; ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++ ++ /* ++ * Must be set before marking the journal entry as open: ++ */ ++ j->cur_entry_u64s = u64s; ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ ++ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); ++ ++ new.idx++; ++ BUG_ON(journal_state_count(new, new.idx)); ++ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); ++ ++ journal_state_inc(&new); ++ new.cur_entry_offset = 0; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ if (j->res_get_blocked_start) ++ bch2_time_stats_update(j->blocked_time, ++ j->res_get_blocked_start); ++ j->res_get_blocked_start = 0; ++ ++ mod_delayed_work(c->io_complete_wq, ++ &j->write_work, ++ msecs_to_jiffies(c->opts.journal_flush_delay)); ++ journal_wake(j); ++ return 0; ++} ++ ++static bool journal_quiesced(struct journal *j) ++{ ++ bool ret = atomic64_read(&j->seq) == j->seq_ondisk; ++ ++ if (!ret) ++ journal_entry_close(j); ++ return ret; ++} ++ ++static void journal_quiesce(struct journal *j) ++{ ++ wait_event(j->wait, journal_quiesced(j)); ++} ++ ++static void journal_write_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(work, struct journal, write_work.work); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ long delta; ++ ++ spin_lock(&j->lock); ++ if (!__journal_entry_is_open(j->reservations)) ++ goto unlock; ++ ++ delta = journal_cur_buf(j)->expires - jiffies; ++ ++ if (delta > 0) ++ mod_delayed_work(c->io_complete_wq, &j->write_work, delta); ++ else ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++unlock: ++ spin_unlock(&j->lock); ++} ++ ++static int __journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ bool can_discard; ++ int ret; ++retry: ++ if (journal_res_get_fast(j, res, flags)) ++ return 0; ++ ++ if (bch2_journal_error(j)) ++ return -EROFS; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Recheck after taking the lock, so we don't race with another thread ++ * that just did journal_entry_open() and call journal_entry_close() ++ * unnecessarily ++ */ ++ if (journal_res_get_fast(j, res, flags)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { ++ /* ++ * Don't want to close current journal entry, just need to ++ * invoke reclaim: ++ */ ++ ret = 
JOURNAL_ERR_journal_full;
++ goto unlock;
++ }
++
++ /*
++ * If we couldn't get a reservation because the current buf filled up,
++ * and we had room for a bigger entry on disk, signal that we want to
++ * realloc the journal bufs:
++ */
++ buf = journal_cur_buf(j);
++ if (journal_entry_is_open(j) &&
++ buf->buf_size >> 9 < buf->disk_sectors &&
++ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
++ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
++
++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
++ ret = journal_entry_open(j);
++
++ if (ret == JOURNAL_ERR_max_in_flight)
++ trace_journal_entry_full(c);
++unlock:
++ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
++ !j->res_get_blocked_start) {
++ j->res_get_blocked_start = local_clock() ?: 1;
++ trace_journal_full(c);
++ }
++
++ can_discard = j->can_discard;
++ spin_unlock(&j->lock);
++
++ if (!ret)
++ goto retry;
++
++ if ((ret == JOURNAL_ERR_journal_full ||
++ ret == JOURNAL_ERR_journal_pin_full) &&
++ !can_discard &&
++ !nr_unwritten_journal_entries(j) &&
++ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
++ struct printbuf buf = PRINTBUF;
++
++ bch_err(c, "Journal stuck! Have a pre-reservation but journal full (ret %s)",
++ bch2_journal_errors[ret]);
++
++ bch2_journal_debug_to_text(&buf, j);
++ bch_err(c, "%s", buf.buf);
++
++ printbuf_reset(&buf);
++ bch2_journal_pins_to_text(&buf, j);
++ bch_err(c, "Journal pins:\n%s", buf.buf);
++
++ printbuf_exit(&buf);
++ bch2_fatal_error(c);
++ dump_stack();
++ }
++
++ /*
++ * Journal is full - can't rely on reclaim from work item due to
++ * freezing:
++ */
++ if ((ret == JOURNAL_ERR_journal_full ||
++ ret == JOURNAL_ERR_journal_pin_full) &&
++ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
++ if (can_discard) {
++ bch2_journal_do_discards(j);
++ goto retry;
++ }
++
++ if (mutex_trylock(&j->reclaim_lock)) {
++ bch2_journal_reclaim(j);
++ mutex_unlock(&j->reclaim_lock);
++ }
++ }
++
++ return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN;
++}
++
++/*
++ * Essentially the entry function to the journaling code. When bcachefs is doing
++ * a btree insert, it calls this function to get the current journal write.
++ * Journal write is the structure used to set up journal writes. The calling
++ * function will then add its keys to the structure, queuing them for the next
++ * write.
++ *
++ * To ensure forward progress, the current task must not be holding any
++ * btree node write locks.
++ */ ++int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || ++ (flags & JOURNAL_RES_GET_NONBLOCK)); ++ return ret; ++} ++ ++/* journal_preres: */ ++ ++static bool journal_preres_available(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); ++ ++ if (!ret && mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ ++ return ret; ++} ++ ++int __bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->preres_wait, ++ (ret = bch2_journal_error(j)) || ++ journal_preres_available(j, res, new_u64s, flags)); ++ return ret; ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *j, ++ struct journal_entry_res *res, ++ unsigned new_u64s) ++{ ++ union journal_res_state state; ++ int d = new_u64s - res->u64s; ++ ++ spin_lock(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ if (d <= 0) ++ goto out; ++ ++ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); ++ smp_mb(); ++ state = READ_ONCE(j->reservations); ++ ++ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && ++ state.cur_entry_offset > j->cur_entry_u64s) { ++ j->cur_entry_u64s += d; ++ /* ++ * Not enough room in current journal entry, have to flush it: ++ */ ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ } else { ++ journal_cur_buf(j)->u64s_reserved += d; ++ } ++out: ++ spin_unlock(&j->lock); ++ res->u64s += d; ++} ++ ++/* journal flushing: */ ++ ++/** ++ * bch2_journal_flush_seq_async - wait for a journal entry to be written ++ * ++ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if ++ * necessary ++ */ ++int bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ int ret = 0; ++ ++ if (seq <= j->flushed_seq_ondisk) ++ return 1; ++ ++ spin_lock(&j->lock); ++ ++ if (WARN_ONCE(seq > journal_cur_seq(j), ++ "requested to flush journal seq %llu, but currently at %llu", ++ seq, journal_cur_seq(j))) ++ goto out; ++ ++ /* Recheck under lock: */ ++ if (j->err_seq && seq >= j->err_seq) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (seq <= j->flushed_seq_ondisk) { ++ ret = 1; ++ goto out; ++ } ++ ++ /* if seq was written, but not flushed - flush a newer one instead */ ++ seq = max(seq, journal_last_unwritten_seq(j)); ++ ++recheck_need_open: ++ if (seq > journal_cur_seq(j)) { ++ struct journal_res res = { 0 }; ++ ++ if (journal_entry_is_open(j)) ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ ++ spin_unlock(&j->lock); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ seq = res.seq; ++ buf = j->buf + (seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ ++ if (parent && !closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ bch2_journal_res_put(j, &res); ++ ++ spin_lock(&j->lock); ++ goto want_write; ++ } ++ ++ /* ++ * if write was kicked off without a flush, flush the next sequence ++ * number instead ++ */ ++ buf = journal_seq_to_buf(j, seq); ++ if (buf->noflush) { ++ seq++; ++ goto recheck_need_open; ++ } ++ ++ buf->must_flush = true; ++ ++ 
if (parent && !closure_wait(&buf->wait, parent)) ++ BUG(); ++want_write: ++ if (seq == journal_cur_seq(j)) ++ journal_entry_want_write(j); ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ ++int bch2_journal_flush_seq(struct journal *j, u64 seq) ++{ ++ u64 start_time = local_clock(); ++ int ret, ret2; ++ ++ /* ++ * Don't update time_stats when @seq is already flushed: ++ */ ++ if (seq <= j->flushed_seq_ondisk) ++ return 0; ++ ++ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); ++ ++ if (!ret) ++ bch2_time_stats_update(j->flush_seq_time, start_time); ++ ++ return ret ?: ret2 < 0 ? ret2 : 0; ++} ++ ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); ++} ++ ++/* ++ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before ++ * @seq ++ */ ++bool bch2_journal_noflush_seq(struct journal *j, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ u64 unwritten_seq; ++ bool ret = false; ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) ++ return false; ++ ++ if (seq <= c->journal.flushed_seq_ondisk) ++ return false; ++ ++ spin_lock(&j->lock); ++ if (seq <= c->journal.flushed_seq_ondisk) ++ goto out; ++ ++ for (unwritten_seq = journal_last_unwritten_seq(j); ++ unwritten_seq < seq; ++ unwritten_seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); ++ ++ /* journal write is already in flight, and was a flush write: */ ++ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) ++ goto out; ++ ++ buf->noflush = true; ++ } ++ ++ ret = true; ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_buf *buf; ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
++{ ++ struct jset_entry_log *entry; ++ struct journal_res res = { 0 }; ++ unsigned msglen, u64s; ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ msglen = vsnprintf(NULL, 0, fmt, args) + 1; ++ va_end(args); ++ ++ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); ++ ++ ret = bch2_journal_res_get(j, &res, u64s, 0); ++ if (ret) ++ return ret; ++ ++ entry = container_of(journal_res_entry(j, &res), ++ struct jset_entry_log, entry);; ++ memset(entry, 0, u64s * sizeof(u64)); ++ entry->entry.type = BCH_JSET_ENTRY_log; ++ entry->entry.u64s = u64s - 1; ++ ++ va_start(args, fmt); ++ vsnprintf(entry->d, INT_MAX, fmt, args); ++ va_end(args); ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++/* block/unlock the journal: */ ++ ++void bch2_journal_unblock(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked--; ++ spin_unlock(&j->lock); ++ ++ journal_wake(j); ++} ++ ++void bch2_journal_block(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked++; ++ spin_unlock(&j->lock); ++ ++ journal_quiesce(j); ++} ++ ++/* allocate journal on a device: */ ++ ++static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ++ bool new_fs, struct closure *cl) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ struct open_bucket **ob = NULL; ++ long *bu = NULL; ++ unsigned i, nr_got = 0, nr_want = nr - ja->nr; ++ unsigned old_nr = ja->nr; ++ unsigned old_discard_idx = ja->discard_idx; ++ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; ++ unsigned old_dirty_idx = ja->dirty_idx; ++ unsigned old_cur_idx = ja->cur_idx; ++ int ret = 0; ++ ++ if (c) { ++ bch2_journal_flush_all_pins(&c->journal); ++ bch2_journal_block(&c->journal); ++ } ++ ++ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); ++ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); ++ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ if (!bu || !ob || !new_buckets || !new_bucket_seq) { ++ ret = -ENOMEM; ++ goto err_unblock; ++ } ++ ++ for (nr_got = 0; nr_got < nr_want; nr_got++) { ++ if (new_fs) { ++ bu[nr_got] = bch2_bucket_alloc_new_fs(ca); ++ if (bu[nr_got] < 0) { ++ ret = -ENOSPC; ++ break; ++ } ++ } else { ++ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, ++ false, cl); ++ if (IS_ERR(ob[nr_got])) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ break; ++ } ++ ++ bu[nr_got] = ob[nr_got]->bucket; ++ } ++ } ++ ++ if (!nr_got) ++ goto err_unblock; ++ ++ /* ++ * We may be called from the device add path, before the new device has ++ * actually been added to the running filesystem: ++ */ ++ if (!new_fs) ++ spin_lock(&c->journal.lock); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ++ for (i = 0; i < nr_got; i++) { ++ unsigned pos = ja->discard_idx ?: ja->nr; ++ long b = bu[i]; ++ ++ __array_insert_item(ja->buckets, ja->nr, pos); ++ __array_insert_item(ja->bucket_seq, ja->nr, pos); ++ ja->nr++; ++ ++ ja->buckets[pos] = b; ++ ja->bucket_seq[pos] = 0; ++ ++ if (pos <= ja->discard_idx) ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ if (pos <= ja->dirty_idx_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ if (pos <= ja->dirty_idx) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ if (pos <= ja->cur_idx) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ } ++ ++ ret = bch2_journal_buckets_to_sb(c, ca); ++ if (ret) { ++ /* Revert: */ ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ja->nr = old_nr; ++ ja->discard_idx = old_discard_idx; ++ ja->dirty_idx_ondisk = old_dirty_idx_ondisk; ++ ja->dirty_idx = old_dirty_idx; ++ ja->cur_idx = old_cur_idx; ++ } ++ ++ if (!new_fs) ++ spin_unlock(&c->journal.lock); ++ ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ ++ if (ret) ++ goto err; ++ ++ if (!new_fs) { ++ for (i = 0; i < nr_got; i++) { ++ ret = bch2_trans_run(c, ++ bch2_trans_mark_metadata_bucket(&trans, ca, ++ bu[i], BCH_DATA_journal, ++ ca->mi.bucket_size)); ++ if (ret) { ++ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); ++ goto err; ++ } ++ } ++ } ++err: ++ if (ob && !new_fs) ++ for (i = 0; i < nr_got; i++) ++ bch2_open_bucket_put(c, ob[i]); ++ ++ kfree(new_bucket_seq); ++ kfree(new_buckets); ++ kfree(ob); ++ kfree(bu); ++ ++ return ret; ++err_unblock: ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ goto err; ++} ++ ++/* ++ * Allocate more journal space at runtime - not currently making use if it, but ++ * the code works: ++ */ ++int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ++ unsigned nr) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct closure cl; ++ unsigned current_nr; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr < ja->nr) ++ return 0; ++ ++ closure_init_stack(&cl); ++ ++ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { ++ struct disk_reservation disk_res = { 0, 0 }; ++ ++ closure_sync(&cl); ++ ++ mutex_lock(&c->sb_lock); ++ current_nr = ja->nr; ++ ++ /* ++ * note: journal buckets aren't really counted as _sectors_ used yet, so ++ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c ++ * when space used goes up without a reservation - but we do need the ++ * reservation to ensure we'll actually be able to allocate: ++ */ ++ ++ if (bch2_disk_reservation_get(c, &disk_res, ++ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOSPC; ++ } ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (ja->nr != current_nr) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return ret; ++} ++ ++int bch2_dev_journal_alloc(struct bch_dev *ca) ++{ ++ unsigned nr; ++ int ret; ++ ++ if 
(dynamic_fault("bcachefs:add:journal_alloc")) ++ return -ENOMEM; ++ ++ /* 1/128th of the device by default: */ ++ nr = ca->mi.nbuckets >> 7; ++ ++ /* ++ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever ++ * is smaller: ++ */ ++ nr = clamp_t(unsigned, nr, ++ BCH_JOURNAL_BUCKETS_MIN, ++ min(1 << 13, ++ (1 << 24) / ca->mi.bucket_size)); ++ ++ if (ca->fs) ++ mutex_lock(&ca->fs->sb_lock); ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++ ++ if (ca->fs) ++ mutex_unlock(&ca->fs->sb_lock); ++ ++ return ret; ++} ++ ++/* startup/shutdown: */ ++ ++static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) ++{ ++ bool ret = false; ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j) && !ret; ++ seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, seq); ++ ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) ++ ret = true; ++ } ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ++{ ++ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); ++} ++ ++void bch2_fs_journal_stop(struct journal *j) ++{ ++ bch2_journal_reclaim_stop(j); ++ bch2_journal_flush_all_pins(j); ++ ++ wait_event(j->wait, journal_entry_close(j)); ++ ++ /* ++ * Always write a new journal entry, to make sure the clock hands are up ++ * to date (and match the superblock) ++ */ ++ bch2_journal_meta(j); ++ ++ journal_quiesce(j); ++ ++ BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags) && ++ j->last_empty_seq != journal_cur_seq(j)); ++ ++ cancel_delayed_work_sync(&j->write_work); ++} ++ ++int bch2_fs_journal_start(struct journal *j, u64 cur_seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ struct journal_replay *i, **_i; ++ struct genradix_iter iter; ++ bool had_entries = false; ++ unsigned ptr; ++ u64 last_seq = cur_seq, nr, seq; ++ ++ genradix_for_each_reverse(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ last_seq = le64_to_cpu(i->j.last_seq); ++ break; ++ } ++ ++ nr = cur_seq - last_seq; ++ ++ if (nr + 1 > j->pin.size) { ++ free_fifo(&j->pin); ++ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); ++ return -ENOMEM; ++ } ++ } ++ ++ j->replay_journal_seq = last_seq; ++ j->replay_journal_seq_end = cur_seq; ++ j->last_seq_ondisk = last_seq; ++ j->flushed_seq_ondisk = cur_seq - 1; ++ j->seq_ondisk = cur_seq - 1; ++ j->pin.front = last_seq; ++ j->pin.back = cur_seq; ++ atomic64_set(&j->seq, cur_seq - 1); ++ ++ fifo_for_each_entry_ptr(p, &j->pin, seq) ++ journal_pin_list_init(p, 1); ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ seq = le64_to_cpu(i->j.seq); ++ BUG_ON(seq >= cur_seq); ++ ++ if (seq < last_seq) ++ continue; ++ ++ if (journal_entry_empty(&i->j)) ++ j->last_empty_seq = le64_to_cpu(i->j.seq); ++ ++ p = journal_seq_pin(j, seq); ++ ++ p->devs.nr = 0; ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) ++ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); ++ ++ had_entries = true; ++ } ++ ++ if (!had_entries) ++ j->last_empty_seq = cur_seq; ++ ++ spin_lock(&j->lock); ++ ++ set_bit(JOURNAL_STARTED, &j->flags); ++ j->last_flush_write = jiffies; ++ ++ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); ++ 
j->reservations.unwritten_idx++; ++ ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ return bch2_journal_reclaim_start(j); ++} ++ ++/* init/exit: */ ++ ++void bch2_dev_journal_exit(struct bch_dev *ca) ++{ ++ kfree(ca->journal.bio); ++ kfree(ca->journal.buckets); ++ kfree(ca->journal.bucket_seq); ++ ++ ca->journal.bio = NULL; ++ ca->journal.buckets = NULL; ++ ca->journal.bucket_seq = NULL; ++} ++ ++int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(sb); ++ struct bch_sb_field_journal_v2 *journal_buckets_v2 = ++ bch2_sb_get_journal_v2(sb); ++ unsigned i; ++ ++ ja->nr = 0; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ ++ for (i = 0; i < nr; i++) ++ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); ++ } else if (journal_buckets) { ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ } ++ ++ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->bucket_seq) ++ return -ENOMEM; ++ ++ ca->journal.bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); ++ if (!ca->journal.bio) ++ return -ENOMEM; ++ ++ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->buckets) ++ return -ENOMEM; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ unsigned j, dst = 0; ++ ++ for (i = 0; i < nr; i++) ++ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ++ ja->buckets[dst++] = ++ le64_to_cpu(journal_buckets_v2->d[i].start) + j; ++ } else if (journal_buckets) { ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ } ++ ++ return 0; ++} ++ ++void bch2_fs_journal_exit(struct journal *j) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(j->buf); i++) ++ kvpfree(j->buf[i].data, j->buf[i].buf_size); ++ free_fifo(&j->pin); ++} ++ ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ static struct lock_class_key res_key; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ spin_lock_init(&j->lock); ++ spin_lock_init(&j->err_lock); ++ init_waitqueue_head(&j->wait); ++ INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ init_waitqueue_head(&j->reclaim_wait); ++ init_waitqueue_head(&j->pin_flush_wait); ++ mutex_init(&j->reclaim_lock); ++ mutex_init(&j->discard_lock); ++ ++ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); ++ ++ atomic64_set(&j->reservations.counter, ++ ((union journal_res_state) ++ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++ ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(j->buf); i++) { ++ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); ++ if (!j->buf[i].data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ j->pin.front = j->pin.back = 1; ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++/* debug: */ ++ ++void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ union journal_res_state s; ++ struct bch_dev *ca; ++ unsigned long now = jiffies; ++ u64 seq; ++ unsigned i; ++ ++ out->atomic++; ++ out->tabstops[0] = 
24; ++ ++ rcu_read_lock(); ++ s = READ_ONCE(j->reservations); ++ ++ prt_printf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); ++ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); ++ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); ++ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); ++ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); ++ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); ++ prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); ++ prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); ++ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); ++ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); ++ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); ++ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); ++ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); ++ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); ++ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ++ ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); ++ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); ++ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); ++ prt_printf(out, "current entry:\t\t"); ++ ++ switch (s.cur_entry_offset) { ++ case JOURNAL_ENTRY_ERROR_VAL: ++ prt_printf(out, "error"); ++ break; ++ case JOURNAL_ENTRY_CLOSED_VAL: ++ prt_printf(out, "closed"); ++ break; ++ default: ++ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); ++ break; ++ } ++ ++ prt_newline(out); ++ ++ for (seq = journal_cur_seq(j); ++ seq >= journal_last_unwritten_seq(j); ++ --seq) { ++ i = seq & JOURNAL_BUF_MASK; ++ ++ prt_printf(out, "unwritten entry:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", seq); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "refcount:"); ++ prt_tab(out); ++ prt_printf(out, "%u", journal_state_count(s, i)); ++ prt_newline(out); ++ ++ prt_printf(out, "sectors:"); ++ prt_tab(out); ++ prt_printf(out, "%u", j->buf[i].sectors); ++ prt_newline(out); ++ ++ prt_printf(out, "expires"); ++ prt_tab(out); ++ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++ } ++ ++ prt_printf(out, ++ "replay done:\t\t%i\n", ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); ++ ++ prt_printf(out, "space:\n"); ++ prt_printf(out, "\tdiscarded\t%u:%u\n", ++ j->space[journal_space_discarded].next_entry, ++ j->space[journal_space_discarded].total); ++ prt_printf(out, "\tclean ondisk\t%u:%u\n", ++ j->space[journal_space_clean_ondisk].next_entry, ++ j->space[journal_space_clean_ondisk].total); ++ prt_printf(out, "\tclean\t\t%u:%u\n", ++ j->space[journal_space_clean].next_entry, ++ j->space[journal_space_clean].total); ++ prt_printf(out, "\ttotal\t\t%u:%u\n", ++ j->space[journal_space_total].next_entry, ++ j->space[journal_space_total].total); ++ ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) ++ continue; ++ ++ if (!ja->nr) ++ continue; ++ ++ prt_printf(out, "dev %u:\n", i); ++ prt_printf(out, "\tnr\t\t%u\n", ja->nr); ++ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); ++ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, 
ja, journal_space_discarded), ja->sectors_free); ++ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); ++ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); ++ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); ++ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ } ++ ++ rcu_read_unlock(); ++ ++ --out->atomic; ++} ++ ++void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ spin_lock(&j->lock); ++ __bch2_journal_debug_to_text(out, j); ++ spin_unlock(&j->lock); ++} ++ ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ ++ spin_lock(&j->lock); ++ *seq = max(*seq, j->pin.front); ++ ++ if (*seq >= j->pin.back) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ ++ out->atomic++; ++ ++ pin_list = journal_seq_pin(j, *seq); ++ ++ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ list_for_each_entry(pin, &pin_list->list, list) { ++ prt_printf(out, "\t%px %ps", pin, pin->flush); ++ prt_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->key_cache_list, list) { ++ prt_printf(out, "\t%px %ps", pin, pin->flush); ++ prt_newline(out); ++ } ++ ++ if (!list_empty(&pin_list->flushed)) { ++ prt_printf(out, "flushed:"); ++ prt_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) { ++ prt_printf(out, "\t%px %ps", pin, pin->flush); ++ prt_newline(out); ++ } ++ ++ printbuf_indent_sub(out, 2); ++ ++ --out->atomic; ++ spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; ++} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +new file mode 100644 +index 000000000000..d3caa7ea7ce9 +--- /dev/null ++++ b/fs/bcachefs/journal.h +@@ -0,0 +1,521 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_H ++#define _BCACHEFS_JOURNAL_H ++ ++/* ++ * THE JOURNAL: ++ * ++ * The primary purpose of the journal is to log updates (insertions) to the ++ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. ++ * ++ * Without the journal, the b-tree is always internally consistent on ++ * disk - and in fact, in the earliest incarnations bcache didn't have a journal ++ * but did handle unclean shutdowns by doing all index updates synchronously ++ * (with coalescing). ++ * ++ * Updates to interior nodes still happen synchronously and without the journal ++ * (for simplicity) - this may change eventually but updates to interior nodes ++ * are rare enough it's not a huge priority. ++ * ++ * This means the journal is relatively separate from the b-tree; it consists of ++ * just a list of keys and journal replay consists of just redoing those ++ * insertions in same order that they appear in the journal. ++ * ++ * PERSISTENCE: ++ * ++ * For synchronous updates (where we're waiting on the index update to hit ++ * disk), the journal entry will be written out immediately (or as soon as ++ * possible, if the write for the previous journal entry was still in flight). 
++ *
++ * Synchronous updates are specified by passing a closure (@flush_cl) to
++ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
++ * down to the journalling code. That closure will wait on the journal
++ * write to complete (via closure_wait()).
++ *
++ * If the index update wasn't synchronous, the journal entry will be
++ * written out after 10 ms have elapsed, by default (the delay_ms field
++ * in struct journal).
++ *
++ * JOURNAL ENTRIES:
++ *
++ * A journal entry is variable size (struct jset); it's got a fixed length
++ * header and then a variable number of struct jset_entry entries.
++ *
++ * Journal entries are identified by monotonically increasing 64 bit sequence
++ * numbers - jset->seq; other places in the code refer to this sequence number.
++ *
++ * A jset_entry entry contains one or more bkeys (which is what gets inserted
++ * into the b-tree). We need a container to indicate which b-tree the key is
++ * for; also, the roots of the various b-trees are stored in jset_entry entries
++ * (one for each b-tree) - this lets us add new b-tree types without changing
++ * the on disk format.
++ *
++ * We also keep some things in the journal header that are logically part of the
++ * superblock - all the things that are frequently updated. This is for future
++ * bcache on raw flash support; the superblock (which will become another
++ * journal) can't be moved or wear leveled, so it contains just enough
++ * information to find the main journal, and the superblock only has to be
++ * rewritten when we want to move/wear level the main journal.
++ *
++ * JOURNAL LAYOUT ON DISK:
++ *
++ * The journal is written to a ringbuffer of buckets (which is kept in the
++ * superblock); the individual buckets are not necessarily contiguous on disk
++ * which means that journal entries are not allowed to span buckets, but also
++ * that we can resize the journal at runtime if desired (unimplemented).
++ *
++ * The journal buckets exist in the same pool as all the other buckets that are
++ * managed by the allocator and garbage collection - garbage collection marks
++ * the journal buckets as metadata buckets.
++ *
++ * OPEN/DIRTY JOURNAL ENTRIES:
++ *
++ * Open/dirty journal entries are journal entries that contain b-tree updates
++ * that have not yet been written out to the b-tree on disk. We have to track
++ * which journal entries are dirty, and we also have to avoid wrapping around
++ * the journal and overwriting old but still dirty journal entries with new
++ * journal entries.
++ *
++ * On disk, this is represented with the "last_seq" field of struct jset;
++ * last_seq is the first sequence number that journal replay has to replay.
++ *
++ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
++ * journal_device->seq) of, for each journal bucket, the highest sequence number
++ * of any journal entry it contains. Then, by comparing that against last_seq we
++ * can determine whether that journal bucket contains dirty journal entries or
++ * not.
++ *
++ * To track which journal entries are dirty, we maintain a fifo of refcounts
++ * (where each entry corresponds to a specific sequence number) - when a ref
++ * goes to 0, that journal entry is no longer dirty.
++ *
++ * Journalling of index updates is done at the same time as the b-tree itself is
++ * being modified (see btree_insert_key()); when we add the key to the journal
++ * the pending b-tree write takes a ref on the journal entry the key was added
++ * to.
If a pending b-tree write would need to take refs on multiple dirty ++ * journal entries, it only keeps the ref on the oldest one (since a newer ++ * journal entry will still be replayed if an older entry was dirty). ++ * ++ * JOURNAL FILLING UP: ++ * ++ * There are two ways the journal could fill up; either we could run out of ++ * space to write to, or we could have too many open journal entries and run out ++ * of room in the fifo of refcounts. Since those refcounts are decremented ++ * without any locking we can't safely resize that fifo, so we handle it the ++ * same way. ++ * ++ * If the journal fills up, we start flushing dirty btree nodes until we can ++ * allocate space for a journal write again - preferentially flushing btree ++ * nodes that are pinning the oldest journal entries first. ++ */ ++ ++#include ++ ++#include "journal_types.h" ++ ++struct bch_fs; ++ ++static inline void journal_wake(struct journal *j) ++{ ++ wake_up(&j->wait); ++ closure_wake_up(&j->async_wait); ++ closure_wake_up(&j->preres_wait); ++} ++ ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.idx; ++} ++ ++/* Sequence number of oldest dirty journal entry */ ++ ++static inline u64 journal_last_seq(struct journal *j) ++{ ++ return j->pin.front; ++} ++ ++static inline u64 journal_cur_seq(struct journal *j) ++{ ++ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ ++ return j->pin.back - 1; ++} ++ ++static inline u64 journal_last_unwritten_seq(struct journal *j) ++{ ++ return j->seq_ondisk + 1; ++} ++ ++static inline int journal_state_count(union journal_res_state s, int idx) ++{ ++ switch (idx) { ++ case 0: return s.buf0_count; ++ case 1: return s.buf1_count; ++ case 2: return s.buf2_count; ++ case 3: return s.buf3_count; ++ } ++ BUG(); ++} ++ ++static inline void journal_state_inc(union journal_res_state *s) ++{ ++ s->buf0_count += s->idx == 0; ++ s->buf1_count += s->idx == 1; ++ s->buf2_count += s->idx == 2; ++ s->buf3_count += s->idx == 3; ++} ++ ++/* ++ * Amount of space that will be taken up by some keys in the journal (i.e. 
++ * including the jset header) ++ */ ++static inline unsigned jset_u64s(unsigned u64s) ++{ ++ return u64s + sizeof(struct jset_entry) / sizeof(u64); ++} ++ ++static inline int journal_entry_overhead(struct journal *j) ++{ ++ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) ++{ ++ struct jset *jset = buf->data; ++ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ ++ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); ++ ++ return entry; ++} ++ ++static inline struct jset_entry * ++journal_res_entry(struct journal *j, struct journal_res *res) ++{ ++ return vstruct_idx(j->buf[res->idx].data, res->offset); ++} ++ ++static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ unsigned u64s) ++{ ++ entry->u64s = cpu_to_le16(u64s); ++ entry->btree_id = id; ++ entry->level = level; ++ entry->type = type; ++ entry->pad[0] = 0; ++ entry->pad[1] = 0; ++ entry->pad[2] = 0; ++ return jset_u64s(u64s); ++} ++ ++static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ const void *data, unsigned u64s) ++{ ++ unsigned ret = journal_entry_init(entry, type, id, level, u64s); ++ ++ memcpy_u64s_small(entry->_data, data, u64s); ++ return ret; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry(struct journal *j, struct journal_res *res, ++ unsigned type, enum btree_id id, ++ unsigned level, unsigned u64s) ++{ ++ struct jset_entry *entry = journal_res_entry(j, res); ++ unsigned actual = journal_entry_init(entry, type, id, level, u64s); ++ ++ EBUG_ON(!res->ref); ++ EBUG_ON(actual > res->u64s); ++ ++ res->offset += actual; ++ res->u64s -= actual; ++ return entry; ++} ++ ++static inline bool journal_entry_empty(struct jset *j) ++{ ++ struct jset_entry *i; ++ ++ if (j->seq != j->last_seq) ++ return false; ++ ++ vstruct_for_each(j, i) ++ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) ++ return false; ++ return true; ++} ++ ++void __bch2_journal_buf_put(struct journal *); ++ ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) ++{ ++ union journal_res_state s; ++ ++ s.v = atomic64_sub_return(((union journal_res_state) { ++ .buf0_count = idx == 0, ++ .buf1_count = idx == 1, ++ .buf2_count = idx == 2, ++ .buf3_count = idx == 3, ++ }).v, &j->reservations.counter); ++ ++ if (!journal_state_count(s, idx) && idx == s.unwritten_idx) ++ __bch2_journal_buf_put(j); ++} ++ ++/* ++ * This function releases the journal write structure so other threads can ++ * then proceed to add their keys as well. 
++ */ ++static inline void bch2_journal_res_put(struct journal *j, ++ struct journal_res *res) ++{ ++ if (!res->ref) ++ return; ++ ++ lock_release(&j->res_map, _THIS_IP_); ++ ++ while (res->u64s) ++ bch2_journal_add_entry(j, res, ++ BCH_JSET_ENTRY_btree_keys, ++ 0, 0, 0); ++ ++ bch2_journal_buf_put(j, res->idx); ++ ++ res->ref = 0; ++} ++ ++int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ++ unsigned); ++ ++/* First two bits for JOURNAL_WATERMARK: */ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 2) ++#define JOURNAL_RES_GET_CHECK (1 << 3) ++ ++static inline int journal_res_get_fast(struct journal *j, ++ struct journal_res *res, ++ unsigned flags) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ /* ++ * Check if there is still room in the current journal ++ * entry: ++ */ ++ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ return 0; ++ ++ EBUG_ON(!journal_state_count(new, new.idx)); ++ ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) ++ return 0; ++ ++ new.cur_entry_offset += res->u64s; ++ journal_state_inc(&new); ++ ++ /* ++ * If the refcount would overflow, we have to wait: ++ * XXX - tracepoint this: ++ */ ++ if (!journal_state_count(new, new.idx)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ res->ref = true; ++ res->idx = old.idx; ++ res->offset = old.cur_entry_offset; ++ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ return 1; ++} ++ ++static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned u64s, unsigned flags) ++{ ++ int ret; ++ ++ EBUG_ON(res->ref); ++ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ ++ res->u64s = u64s; ++ ++ if (journal_res_get_fast(j, res, flags)) ++ goto out; ++ ++ ret = bch2_journal_res_get_slowpath(j, res, flags); ++ if (ret) ++ return ret; ++out: ++ if (!(flags & JOURNAL_RES_GET_CHECK)) { ++ lock_acquire_shared(&j->res_map, 0, ++ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, ++ NULL, _THIS_IP_); ++ EBUG_ON(!res->ref); ++ } ++ return 0; ++} ++ ++/* journal_preres: */ ++ ++static inline void journal_set_watermark(struct journal *j) ++{ ++ union journal_preres_state s = READ_ONCE(j->prereserved); ++ unsigned watermark = JOURNAL_WATERMARK_any; ++ ++ if (fifo_free(&j->pin) < j->pin.size / 4) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (fifo_free(&j->pin) < j->pin.size / 8) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (s.reserved > s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (!s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (watermark == j->watermark) ++ return; ++ ++ swap(watermark, j->watermark); ++ if (watermark > j->watermark) ++ journal_wake(j); ++} ++ ++static inline void bch2_journal_preres_put(struct journal *j, ++ struct journal_preres *res) ++{ ++ union journal_preres_state s = { .reserved = res->u64s }; ++ ++ if (!res->u64s) ++ return; ++ ++ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); ++ res->u64s = 0; ++ ++ if (unlikely(s.waiting)) { ++ clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), ++ (unsigned long *) &j->prereserved.v); ++ closure_wake_up(&j->preres_wait); ++ } ++ ++ if (s.reserved <= s.remaining && j->watermark) ++ journal_set_watermark(j); ++} ++ ++int __bch2_journal_preres_get(struct 
journal *, ++ struct journal_preres *, unsigned, unsigned); ++ ++static inline int bch2_journal_preres_get_fast(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags, ++ bool set_waiting) ++{ ++ int d = new_u64s - res->u64s; ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ int ret; ++ ++ do { ++ old.v = new.v = v; ++ ret = 0; ++ ++ if ((flags & JOURNAL_WATERMARK_reserved) || ++ new.reserved + d < new.remaining) { ++ new.reserved += d; ++ ret = 1; ++ } else if (set_waiting && !new.waiting) ++ new.waiting = true; ++ else ++ return 0; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++ ++ if (ret) ++ res->u64s += d; ++ return ret; ++} ++ ++static inline int bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ if (new_u64s <= res->u64s) ++ return 0; ++ ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_NONBLOCK) ++ return -EAGAIN; ++ ++ return __bch2_journal_preres_get(j, res, new_u64s, flags); ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *, ++ struct journal_entry_res *, ++ unsigned); ++ ++int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++void bch2_journal_flush_async(struct journal *, struct closure *); ++ ++int bch2_journal_flush_seq(struct journal *, u64); ++int bch2_journal_flush(struct journal *); ++bool bch2_journal_noflush_seq(struct journal *, u64); ++int bch2_journal_meta(struct journal *); ++int bch2_journal_log_msg(struct journal *, const char *, ...); ++ ++void bch2_journal_halt(struct journal *); ++ ++static inline int bch2_journal_error(struct journal *j) ++{ ++ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ++ ? 
-EIO : 0; ++} ++ ++struct bch_dev; ++ ++static inline void bch2_journal_set_replay_done(struct journal *j) ++{ ++ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ set_bit(JOURNAL_REPLAY_DONE, &j->flags); ++} ++ ++void bch2_journal_unblock(struct journal *); ++void bch2_journal_block(struct journal *); ++ ++void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); ++ ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, ++ unsigned nr); ++int bch2_dev_journal_alloc(struct bch_dev *); ++ ++void bch2_dev_journal_stop(struct journal *, struct bch_dev *); ++ ++void bch2_fs_journal_stop(struct journal *); ++int bch2_fs_journal_start(struct journal *, u64); ++ ++void bch2_dev_journal_exit(struct bch_dev *); ++int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); ++void bch2_fs_journal_exit(struct journal *); ++int bch2_fs_journal_init(struct journal *); ++ ++#endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +new file mode 100644 +index 000000000000..6fa2c54c1af4 +--- /dev/null ++++ b/fs/bcachefs/journal_io.c +@@ -0,0 +1,1735 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_io.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++ ++#include ++ ++static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) ++{ ++ return (seq - c->journal_entries_base_seq) & (~0U >> 1); ++} ++ ++static void __journal_replay_free(struct bch_fs *c, ++ struct journal_replay *i) ++{ ++ struct journal_replay **p = ++ genradix_ptr(&c->journal_entries, ++ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); ++ ++ BUG_ON(*p != i); ++ *p = NULL; ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++} ++ ++static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) ++{ ++ i->ignore = true; ++ ++ if (!c->opts.read_entire_journal) ++ __journal_replay_free(c, i); ++} ++ ++struct journal_list { ++ struct closure cl; ++ u64 last_seq; ++ struct mutex lock; ++ int ret; ++}; ++ ++#define JOURNAL_ENTRY_ADD_OK 0 ++#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 ++ ++/* ++ * Given a journal entry we just read, add it to the list of journal entries to ++ * be replayed: ++ */ ++static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct journal_ptr entry_ptr, ++ struct journal_list *jlist, struct jset *j, ++ bool bad) ++{ ++ struct genradix_iter iter; ++ struct journal_replay **_i, *i, *dup; ++ struct journal_ptr *ptr; ++ size_t bytes = vstruct_bytes(j); ++ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; ++ int ret = JOURNAL_ENTRY_ADD_OK; ++ ++ /* Is this entry older than the range we need? 
*/ ++ if (!c->opts.read_entire_journal && ++ le64_to_cpu(j->seq) < jlist->last_seq) ++ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ ++ /* ++ * genradixes are indexed by a ulong, not a u64, so we can't index them ++ * by sequence number directly: Assume instead that they will all fall ++ * within the range of +-2 billion of the first one we find. ++ */ ++ if (!c->journal_entries_base_seq) ++ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); ++ ++ /* Drop entries we don't need anymore */ ++ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { ++ genradix_for_each_from(&c->journal_entries, iter, _i, ++ journal_entry_radix_idx(c, jlist->last_seq)) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ if (le64_to_cpu(i->j.seq) >= last_seq) ++ break; ++ journal_replay_free(c, i); ++ } ++ } ++ ++ jlist->last_seq = max(jlist->last_seq, last_seq); ++ ++ _i = genradix_ptr_alloc(&c->journal_entries, ++ journal_entry_radix_idx(c, le64_to_cpu(j->seq)), ++ GFP_KERNEL); ++ if (!_i) ++ return -ENOMEM; ++ ++ /* ++ * Duplicate journal entries? If so we want the one that didn't have a ++ * checksum error: ++ */ ++ dup = *_i; ++ if (dup) { ++ if (dup->bad) { ++ /* we'll replace @dup: */ ++ } else if (bad) { ++ i = dup; ++ goto found; ++ } else { ++ fsck_err_on(bytes != vstruct_bytes(&dup->j) || ++ memcmp(j, &dup->j, bytes), c, ++ "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ i = dup; ++ goto found; ++ } ++ } ++ ++ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ i->nr_ptrs = 0; ++ i->bad = bad; ++ i->ignore = false; ++ memcpy(&i->j, j, bytes); ++ ++ if (dup) { ++ i->nr_ptrs = dup->nr_ptrs; ++ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); ++ __journal_replay_free(c, dup); ++ } ++ ++ ++ *_i = i; ++found: ++ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { ++ if (ptr->dev == ca->dev_idx) { ++ bch_err(c, "duplicate journal entry %llu on same device", ++ le64_to_cpu(i->j.seq)); ++ goto out; ++ } ++ } ++ ++ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { ++ bch_err(c, "found too many copies of journal entry %llu", ++ le64_to_cpu(i->j.seq)); ++ goto out; ++ } ++ ++ i->ptrs[i->nr_ptrs++] = entry_ptr; ++out: ++fsck_err: ++ return ret; ++} ++ ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++/* this fills in a range with empty jset_entries: */ ++static void journal_entry_null_range(void *start, void *end) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = start; entry != end; entry = vstruct_next(entry)) ++ memset(entry, 0, sizeof(*entry)); ++} ++ ++#define JOURNAL_ENTRY_REREAD 5 ++#define JOURNAL_ENTRY_NONE 6 ++#define JOURNAL_ENTRY_BAD 7 ++ ++#define journal_entry_err(c, msg, ...) \ ++({ \ ++ switch (write) { \ ++ case READ: \ ++ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write:\n" \ ++ msg, ##__VA_ARGS__); \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++ true; \ ++}) ++ ++#define journal_entry_err_on(cond, c, msg, ...) \ ++ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) ++ ++#define FSCK_DELETED_KEY 5 ++ ++static int journal_validate_key(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned level, enum btree_id btree_id, ++ struct bkey_i *k, ++ unsigned version, int big_endian, int write) ++{ ++ void *next = vstruct_next(entry); ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!k->k.u64s, c, ++ "invalid key in %s at %s offset %zi/%u: k->u64s 0", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (journal_entry_err_on((void *) bkey_next(k) > ++ (void *) vstruct_next(entry), c, ++ "invalid key in %s at %s offset %zi/%u: extends past end of journal entry", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, ++ "invalid key in %s at %s offset %zi/%u: bad format %u", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s), ++ k->k.format)) { ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (!write) ++ bch2_bkey_compat(level, btree_id, version, big_endian, ++ write, NULL, bkey_to_packed(k)); ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id), write, &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s)); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); ++ prt_newline(&buf); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id), write, &buf); ++ ++ mustfix_fsck_err(c, "%s", buf.buf); ++ ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ ++ printbuf_exit(&buf); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (write) ++ bch2_bkey_compat(level, btree_id, version, big_endian, ++ write, NULL, bkey_to_packed(k)); ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int journal_entry_btree_keys_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct bkey_i *k = entry->start; ++ ++ while (k != vstruct_last(entry)) { ++ int ret = journal_validate_key(c, where, entry, ++ entry->level, ++ entry->btree_id, ++ k, version, big_endian, write); ++ if (ret == FSCK_DELETED_KEY) ++ continue; ++ ++ k = bkey_next(k); ++ } ++ ++ return 0; ++} ++ ++static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct bkey_i *k; ++ bool first = true; ++ ++ vstruct_for_each(entry, k) { ++ if (!first) { ++ prt_newline(out); ++ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ } ++ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); ++ 
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); ++ first = false; ++ } ++} ++ ++static int journal_entry_btree_root_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct bkey_i *k = entry->start; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!entry->u64s || ++ le16_to_cpu(entry->u64s) != k->k.u64s, c, ++ "invalid btree root journal entry: wrong number of keys")) { ++ void *next = vstruct_next(entry); ++ /* ++ * we don't want to null out this jset_entry, ++ * just the contents, so that later we can tell ++ * we were _supposed_ to have a btree root ++ */ ++ entry->u64s = 0; ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ return journal_validate_key(c, where, entry, 1, entry->btree_id, k, ++ version, big_endian, write); ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++static int journal_entry_prio_ptrs_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ /* obsolete, don't care: */ ++ return 0; ++} ++ ++static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++} ++ ++static int journal_entry_blacklist_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist *bl = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); ++} ++ ++static int journal_entry_blacklist_v2_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_blacklist_v2 *bl_entry; ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ goto out; ++ } ++ ++ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > ++ le64_to_cpu(bl_entry->end), c, ++ "invalid journal seq blacklist entry: start > end")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++out: ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist_v2 *bl = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ prt_printf(out, "start=%llu end=%llu", ++ le64_to_cpu(bl->start), ++ le64_to_cpu(bl->end)); ++} ++ ++static int journal_entry_usage_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < 
sizeof(*u), ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ prt_printf(out, "type=%s v=%llu", ++ bch2_fs_usage_types[u->entry.btree_id], ++ le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_data_usage_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u) || ++ bytes < sizeof(*u) + u->r.nr_devs, ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ bch2_replicas_entry_to_text(out, &u->r); ++ prt_printf(out, "=%llu", le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_clock_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes != sizeof(*clock), ++ c, "invalid journal entry clock: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ if (journal_entry_err_on(clock->rw > 1, ++ c, "invalid journal entry clock: bad rw")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); ++} ++ ++static int journal_entry_dev_usage_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ unsigned expected = sizeof(*u); ++ unsigned dev; ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < expected, ++ c, "invalid journal entry dev usage: bad size (%u < %u)", ++ bytes, expected)) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ dev = le32_to_cpu(u->dev); ++ ++ if (journal_entry_err_on(!bch2_dev_exists2(c, dev), ++ c, "invalid journal entry dev usage: bad dev")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ if (journal_entry_err_on(u->pad, ++ c, "invalid journal entry dev usage: bad pad")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); ++ ++ for (i = 0; i < nr_types; i++) { ++ if (i < BCH_DATA_NR) ++ prt_printf(out, " %s", bch2_data_types[i]); ++ else ++ prt_printf(out, " (unknown data type %u)", i); ++ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", ++ le64_to_cpu(u->d[i].buckets), ++ le64_to_cpu(u->d[i].sectors), ++ le64_to_cpu(u->d[i].fragmented)); ++ } ++ ++ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); ++} ++ ++static int journal_entry_log_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return 0; ++} ++ ++static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); ++ ++ prt_printf(out, "%.*s", bytes, l->d); ++} ++ ++static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write); ++} ++ ++static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++struct jset_entry_ops { ++ int (*validate)(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); ++}; ++ ++static const struct jset_entry_ops bch2_jset_entry_ops[] = { ++#define x(f, nr) \ ++ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ ++ .validate = journal_entry_##f##_validate, \ ++ .to_text = journal_entry_##f##_to_text, \ ++ }, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++}; ++ ++int bch2_journal_entry_validate(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return entry->type < BCH_JSET_ENTRY_NR ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, where, entry, ++ version, big_endian, write) ++ : 0; ++} ++ ++void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ if (entry->type < BCH_JSET_ENTRY_NR) { ++ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ bch2_jset_entry_ops[entry->type].to_text(out, c, entry); ++ } else { ++ prt_printf(out, "(unknown type %u)", entry->type); ++ } ++} ++ ++static int jset_validate_entries(struct bch_fs *c, struct jset *jset, ++ int write) ++{ ++ char buf[100]; ++ struct jset_entry *entry; ++ int ret = 0; ++ ++ vstruct_for_each(jset, entry) { ++ scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", ++ le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s)); ++ ++ if (journal_entry_err_on(vstruct_next(entry) > ++ vstruct_last(jset), c, ++ "journal entry extends past end of jset")) { ++ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); ++ break; ++ } ++ ++ ret = bch2_journal_entry_validate(c, buf, entry, ++ le32_to_cpu(jset->version), ++ JSET_BIG_ENDIAN(jset), write); ++ if (ret) ++ break; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate(struct bch_fs *c, ++ struct bch_dev *ca, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read, ++ int write) ++{ ++ size_t bytes = vstruct_bytes(jset); ++ struct bch_csum csum; ++ unsigned version; ++ int ret = 0; ++ ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, c, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* don't try to continue: */ ++ return EINVAL; ++ } ++ ++ if (bytes > (sectors_read << 9) && ++ sectors_read < bucket_sectors_left) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), bytes)) { ++ ret = JOURNAL_ENTRY_BAD; ++ le32_add_cpu(&jset->u64s, ++ -((bytes - (bucket_sectors_left << 9)) / 8)); ++ } ++ ++ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), ++ JSET_CSUM_TYPE(jset))) { ++ ret = JOURNAL_ENTRY_BAD; ++ goto csum_done; ++ } ++ ++ if (write) ++ goto csum_done; ++ ++ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); ++ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, ++ "%s sector %llu seq %llu: journal checksum bad", ++ ca ? 
ca->name : c->name, ++ sector, le64_to_cpu(jset->seq))) ++ ret = JOURNAL_ENTRY_BAD; ++ ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); ++csum_done: ++ /* last_seq is ignored when JSET_NO_FLUSH is true */ ++ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && ++ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq (%llu > %llu)", ++ le64_to_cpu(jset->last_seq), ++ le64_to_cpu(jset->seq))) { ++ jset->last_seq = jset->seq; ++ return JOURNAL_ENTRY_BAD; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) ++{ ++ unsigned sectors = vstruct_sectors(jset, c->block_bits); ++ ++ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: ++ jset_validate_entries(c, jset, WRITE); ++} ++ ++struct journal_read_buf { ++ void *data; ++ size_t size; ++}; ++ ++static int journal_read_buf_realloc(struct journal_read_buf *b, ++ size_t new_size) ++{ ++ void *n; ++ ++ /* the bios are sized for this many pages, max: */ ++ if (new_size > JOURNAL_ENTRY_SIZE_MAX) ++ return -ENOMEM; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = kvpmalloc(new_size, GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ kvpfree(b->data, b->size); ++ b->data = n; ++ b->size = new_size; ++ return 0; ++} ++ ++static int journal_read_bucket(struct bch_dev *ca, ++ struct journal_read_buf *buf, ++ struct journal_list *jlist, ++ unsigned bucket) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct jset *j = NULL; ++ unsigned sectors, sectors_read = 0; ++ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), ++ end = offset + ca->mi.bucket_size; ++ bool saw_bad = false; ++ int ret = 0; ++ ++ pr_debug("reading %u", bucket); ++ ++ while (offset < end) { ++ if (!sectors_read) { ++ struct bio *bio; ++reread: ++ sectors_read = min_t(unsigned, ++ end - offset, buf->size >> 9); ++ ++ bio = bio_kmalloc(GFP_KERNEL, ++ buf_pages(buf->data, ++ sectors_read << 9)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch2_bio_map(bio, buf->data, sectors_read << 9); ++ ++ ret = submit_bio_wait(bio); ++ bio_put(bio); ++ ++ if (bch2_dev_io_err_on(ret, ca, ++ "journal read error: sector %llu", ++ offset) || ++ bch2_meta_read_fault("journal")) { ++ /* ++ * We don't error out of the recovery process ++ * here, since the relevant journal entry may be ++ * found on a different device, and missing or ++ * no journal entries will be handled later ++ */ ++ return 0; ++ } ++ ++ j = buf->data; ++ } ++ ++ ret = jset_validate(c, ca, j, offset, ++ end - offset, sectors_read, ++ READ); ++ switch (ret) { ++ case 0: ++ sectors = vstruct_sectors(j, c->block_bits); ++ break; ++ case JOURNAL_ENTRY_REREAD: ++ if (vstruct_bytes(j) > buf->size) { ++ ret = journal_read_buf_realloc(buf, ++ vstruct_bytes(j)); ++ if (ret) ++ return ret; ++ } ++ goto reread; ++ case JOURNAL_ENTRY_NONE: ++ if (!saw_bad) ++ return 0; ++ sectors = block_sectors(c); ++ goto next_block; ++ case JOURNAL_ENTRY_BAD: ++ saw_bad = true; ++ /* ++ * On checksum error we don't really trust the size ++ * field of the journal entry we read, so try reading ++ * again at next block boundary: ++ */ ++ sectors = block_sectors(c); ++ break; ++ default: ++ return ret; ++ } ++ ++ /* ++ * This happens sometimes if we don't have 
discards on - ++ * when we've partially overwritten a bucket with new ++ * journal entries. We don't need the rest of the ++ * bucket: ++ */ ++ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) ++ return 0; ++ ++ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ++ ++ mutex_lock(&jlist->lock); ++ ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .dev = ca->dev_idx, ++ .bucket = bucket, ++ .bucket_offset = offset - ++ bucket_to_sector(ca, ja->buckets[bucket]), ++ .sector = offset, ++ }, jlist, j, ret != 0); ++ mutex_unlock(&jlist->lock); ++ ++ switch (ret) { ++ case JOURNAL_ENTRY_ADD_OK: ++ break; ++ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: ++ break; ++ default: ++ return ret; ++ } ++next_block: ++ pr_debug("next"); ++ offset += sectors; ++ sectors_read -= sectors; ++ j = ((void *) j) + (sectors << 9); ++ } ++ ++ return 0; ++} ++ ++static void bch2_journal_read_device(struct closure *cl) ++{ ++ struct journal_device *ja = ++ container_of(cl, struct journal_device, read); ++ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct bch_fs *c = ca->fs; ++ struct journal_list *jlist = ++ container_of(cl->parent, struct journal_list, cl); ++ struct journal_replay *r, **_r; ++ struct genradix_iter iter; ++ struct journal_read_buf buf = { NULL, 0 }; ++ u64 min_seq = U64_MAX; ++ unsigned i; ++ int ret = 0; ++ ++ if (!ja->nr) ++ goto out; ++ ++ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ if (ret) ++ goto err; ++ ++ pr_debug("%u journal buckets", ja->nr); ++ ++ for (i = 0; i < ja->nr; i++) { ++ ret = journal_read_bucket(ca, &buf, jlist, i); ++ if (ret) ++ goto err; ++ } ++ ++ /* Find the journal bucket with the highest sequence number: */ ++ for (i = 0; i < ja->nr; i++) { ++ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) ++ ja->cur_idx = i; ++ ++ min_seq = min(ja->bucket_seq[i], min_seq); ++ } ++ ++ /* ++ * If there's duplicate journal entries in multiple buckets (which ++ * definitely isn't supposed to happen, but...) 
- make sure to start ++ * cur_idx at the last of those buckets, so we don't deadlock trying to ++ * allocate ++ */ ++ while (ja->bucket_seq[ja->cur_idx] > min_seq && ++ ja->bucket_seq[ja->cur_idx] == ++ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ mutex_lock(&jlist->lock); ++ genradix_for_each(&c->journal_entries, iter, _r) { ++ r = *_r; ++ ++ if (!r) ++ continue; ++ ++ for (i = 0; i < r->nr_ptrs; i++) { ++ if (r->ptrs[i].dev == ca->dev_idx && ++ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { ++ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + ++ vstruct_sectors(&r->j, c->block_bits); ++ ++ ja->sectors_free = min(ja->sectors_free, ++ ca->mi.bucket_size - wrote); ++ } ++ } ++ } ++ mutex_unlock(&jlist->lock); ++ ++ if (ja->bucket_seq[ja->cur_idx] && ++ ja->sectors_free == ca->mi.bucket_size) { ++ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); ++ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); ++ for (i = 0; i < 3; i++) { ++ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; ++ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); ++ } ++ ja->sectors_free = 0; ++ } ++ ++ /* ++ * Set dirty_idx to indicate the entire journal is full and needs to be ++ * reclaimed - journal reclaim will immediately reclaim whatever isn't ++ * pinned when it first runs: ++ */ ++ ja->discard_idx = ja->dirty_idx_ondisk = ++ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; ++out: ++ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); ++ kvpfree(buf.data, buf.size); ++ percpu_ref_put(&ca->io_ref); ++ closure_return(cl); ++ return; ++err: ++ mutex_lock(&jlist->lock); ++ jlist->ret = ret; ++ mutex_unlock(&jlist->lock); ++ goto out; ++} ++ ++void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) ++{ ++ unsigned i; ++ ++ for (i = 0; i < j->nr_ptrs; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); ++ u64 offset; ++ ++ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); ++ ++ if (i) ++ prt_printf(out, " "); ++ prt_printf(out, "%u:%u:%u (sector %llu)", ++ j->ptrs[i].dev, ++ j->ptrs[i].bucket, ++ j->ptrs[i].bucket_offset, ++ j->ptrs[i].sector); ++ } ++} ++ ++int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) ++{ ++ struct journal_list jlist; ++ struct journal_replay *i, **_i, *prev = NULL; ++ struct genradix_iter radix_iter; ++ struct bch_dev *ca; ++ unsigned iter; ++ struct printbuf buf = PRINTBUF; ++ size_t keys = 0, entries = 0; ++ bool degraded = false; ++ u64 seq, last_seq = 0; ++ int ret = 0; ++ ++ closure_init_stack(&jlist.cl); ++ mutex_init(&jlist.lock); ++ jlist.last_seq = 0; ++ jlist.ret = 0; ++ ++ for_each_member_device(ca, c, iter) { ++ if (!c->opts.fsck && ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) ++ continue; ++ ++ if ((ca->mi.state == BCH_MEMBER_STATE_rw || ++ ca->mi.state == BCH_MEMBER_STATE_ro) && ++ percpu_ref_tryget(&ca->io_ref)) ++ closure_call(&ca->journal.read, ++ bch2_journal_read_device, ++ system_unbound_wq, ++ &jlist.cl); ++ else ++ degraded = true; ++ } ++ ++ closure_sync(&jlist.cl); ++ ++ if (jlist.ret) ++ return jlist.ret; ++ ++ *start_seq = 0; ++ ++ /* ++ * Find most recent flush entry, and ignore newer non flush entries - ++ * those entries will be blacklisted: ++ */ ++ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ if (!*start_seq) ++ 
*start_seq = le64_to_cpu(i->j.seq) + 1; ++ ++ if (!JSET_NO_FLUSH(&i->j)) { ++ last_seq = le64_to_cpu(i->j.last_seq); ++ *blacklist_seq = le64_to_cpu(i->j.seq) + 1; ++ break; ++ } ++ ++ journal_replay_free(c, i); ++ } ++ ++ if (!*start_seq) { ++ bch_info(c, "journal read done, but no entries found"); ++ return 0; ++ } ++ ++ if (!last_seq) { ++ fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); ++ ret = -1; ++ goto err; ++ } ++ ++ /* Drop blacklisted entries and entries older than last_seq: */ ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ seq = le64_to_cpu(i->j.seq); ++ if (seq < last_seq) { ++ journal_replay_free(c, i); ++ continue; ++ } ++ ++ if (bch2_journal_seq_is_blacklisted(c, seq, true)) { ++ fsck_err_on(!JSET_NO_FLUSH(&i->j), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ journal_replay_free(c, i); ++ } ++ } ++ ++ /* Check for missing entries: */ ++ seq = last_seq; ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ BUG_ON(seq > le64_to_cpu(i->j.seq)); ++ ++ while (seq < le64_to_cpu(i->j.seq)) { ++ u64 missing_start, missing_end; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ while (seq < le64_to_cpu(i->j.seq) && ++ bch2_journal_seq_is_blacklisted(c, seq, false)) ++ seq++; ++ ++ if (seq == le64_to_cpu(i->j.seq)) ++ break; ++ ++ missing_start = seq; ++ ++ while (seq < le64_to_cpu(i->j.seq) && ++ !bch2_journal_seq_is_blacklisted(c, seq, false)) ++ seq++; ++ ++ if (prev) { ++ bch2_journal_ptrs_to_text(&buf1, c, prev); ++ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); ++ } else ++ prt_printf(&buf1, "(none)"); ++ bch2_journal_ptrs_to_text(&buf2, c, i); ++ ++ missing_end = seq - 1; ++ fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" ++ " prev at %s\n" ++ " next at %s", ++ missing_start, missing_end, ++ last_seq, *blacklist_seq - 1, ++ buf1.buf, buf2.buf); ++ ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); ++ } ++ ++ prev = i; ++ seq++; ++ } ++ ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct bch_replicas_padded replicas = { ++ .e.data_type = BCH_DATA_journal, ++ .e.nr_required = 1, ++ }; ++ unsigned ptr; ++ ++ i = *_i; ++ if (!i || i->ignore) ++ continue; ++ ++ ret = jset_validate_entries(c, &i->j, READ); ++ if (ret) ++ goto err; ++ ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) ++ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; ++ ++ bch2_replicas_entry_sort(&replicas.e); ++ ++ /* ++ * If we're mounting in degraded mode - if we didn't read all ++ * the devices - this is wrong: ++ */ ++ ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, &replicas.e); ++ ++ if (!degraded && ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, ++ "superblock not marked as containing replicas %s", ++ buf.buf)) { ++ ret = bch2_mark_replicas(c, &replicas.e); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys++; ++ entries++; ++ } ++ ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, *start_seq); ++ ++ if (*start_seq != *blacklist_seq) ++ bch_info(c, "dropped unflushed entries %llu-%llu", ++ *blacklist_seq, *start_seq - 1); ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* journal write: */ ++ ++static void __journal_write_alloc(struct journal *j, ++ struct journal_buf *w, ++ struct dev_alloc_list *devs_sorted, ++ unsigned sectors, ++ unsigned *replicas, ++ unsigned replicas_want) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (*replicas >= replicas_want) ++ return; ++ ++ for (i = 0; i < devs_sorted->nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ /* ++ * Check that we can use this device, and aren't already using ++ * it: ++ */ ++ if (!ca->mi.durability || ++ ca->mi.state != BCH_MEMBER_STATE_rw || ++ !ja->nr || ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ++ ca->dev_idx) || ++ sectors > ja->sectors_free) ++ continue; ++ ++ bch2_dev_stripe_increment(ca, &j->wp.stripe); ++ ++ bch2_bkey_append_ptr(&w->key, ++ (struct bch_extent_ptr) { ++ .offset = bucket_to_sector(ca, ++ ja->buckets[ja->cur_idx]) + ++ ca->mi.bucket_size - ++ ja->sectors_free, ++ .dev = ca->dev_idx, ++ }); ++ ++ ja->sectors_free -= sectors; ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ ++ *replicas += ca->mi.durability; ++ ++ if (*replicas >= replicas_want) ++ break; ++ } ++} ++ ++/** ++ * journal_next_bucket - move on to the next journal bucket if possible ++ */ ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ unsigned sectors) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_devs_mask devs; ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ struct dev_alloc_list devs_sorted; ++ unsigned target = c->opts.metadata_target ?: ++ c->opts.foreground_target; ++ unsigned i, replicas = 0, replicas_want = ++ READ_ONCE(c->opts.metadata_replicas); ++ ++ rcu_read_lock(); ++retry: ++ devs = target_rw_devs(c, BCH_DATA_journal, target); ++ ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); ++ ++ 
__journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas >= replicas_want) ++ goto done; ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ } ++ } ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas < replicas_want && target) { ++ /* Retry from all devices: */ ++ target = 0; ++ goto retry; ++ } ++done: ++ rcu_read_unlock(); ++ ++ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); ++ ++ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; ++} ++ ++static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) ++{ ++ /* we aren't holding j->lock: */ ++ unsigned new_size = READ_ONCE(j->buf_size_want); ++ void *new_buf; ++ ++ if (buf->buf_size >= new_size) ++ return; ++ ++ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); ++ if (!new_buf) ++ return; ++ ++ memcpy(new_buf, buf->data, buf->buf_size); ++ ++ spin_lock(&j->lock); ++ swap(buf->data, new_buf); ++ swap(buf->buf_size, new_size); ++ spin_unlock(&j->lock); ++ ++ kvpfree(new_buf, new_size); ++} ++ ++static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) ++{ ++ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); ++} ++ ++static void journal_write_done(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_replicas_padded replicas; ++ union journal_res_state old, new; ++ u64 v, seq; ++ int err = 0; ++ ++ bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ++ ? 
j->flush_write_time ++ : j->noflush_write_time, j->write_start_time); ++ ++ if (!w->devs_written.nr) { ++ bch_err(c, "unable to write journal to sufficient devices"); ++ err = -EIO; ++ } else { ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ w->devs_written); ++ if (bch2_mark_replicas(c, &replicas.e)) ++ err = -EIO; ++ } ++ ++ if (err) ++ bch2_fatal_error(c); ++ ++ spin_lock(&j->lock); ++ seq = le64_to_cpu(w->data->seq); ++ ++ if (seq >= j->pin.front) ++ journal_seq_pin(j, seq)->devs = w->devs_written; ++ ++ if (!err) { ++ if (!JSET_NO_FLUSH(w->data)) { ++ j->flushed_seq_ondisk = seq; ++ j->last_seq_ondisk = w->last_seq; ++ ++ bch2_do_discards(c); ++ closure_wake_up(&c->freelist_wait); ++ } ++ } else if (!j->err_seq || seq < j->err_seq) ++ j->err_seq = seq; ++ ++ j->seq_ondisk = seq; ++ ++ /* ++ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard ++ * more buckets: ++ * ++ * Must come before signaling write completion, for ++ * bch2_fs_journal_stop(): ++ */ ++ if (j->watermark) ++ journal_reclaim_kick(&c->journal); ++ ++ /* also must come before signalling write completion: */ ++ closure_debug_destroy(cl); ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ BUG_ON(journal_state_count(new, new.unwritten_idx)); ++ ++ new.unwritten_idx++; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ bch2_journal_space_available(j); ++ ++ closure_wake_up(&w->wait); ++ journal_wake(j); ++ ++ if (!journal_state_count(new, new.unwritten_idx) && ++ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { ++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && ++ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ long delta = buf->expires - jiffies; ++ ++ /* ++ * We don't close a journal entry to write it while there's ++ * previous entries still in flight - the current journal entry ++ * might want to be written now: ++ */ ++ ++ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++static void journal_write_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ struct journal *j = &ca->fs->journal; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ unsigned long flags; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", ++ le64_to_cpu(w->data->seq), ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("journal")) { ++ spin_lock_irqsave(&j->err_lock, flags); ++ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); ++ spin_unlock_irqrestore(&j->err_lock, flags); ++ } ++ ++ closure_put(&j->io); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void do_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_extent_ptr *ptr; ++ struct bio *bio; ++ unsigned sectors = vstruct_sectors(w->data, c->block_bits); ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], ++ sectors); ++ ++ bio = 
ca->journal.bio; ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ ++ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ++ ca->prev_journal_sector = bio->bi_iter.bi_sector; ++ ++ if (!JSET_NO_FLUSH(w->data)) ++ bio->bi_opf |= REQ_FUA; ++ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) ++ bio->bi_opf |= REQ_PREFLUSH; ++ ++ bch2_bio_map(bio, w->data, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ ca->journal.bucket_seq[ca->journal.cur_idx] = ++ le64_to_cpu(w->data->seq); ++ } ++ ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++ return; ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct jset_entry *start, *end; ++ struct jset *jset; ++ struct bio *bio; ++ struct printbuf journal_debug_buf = PRINTBUF; ++ bool validate_before_checksum = false; ++ unsigned i, sectors, bytes, u64s, nr_rw_members = 0; ++ int ret; ++ ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ ++ journal_buf_realloc(j, w); ++ jset = w->data; ++ ++ j->write_start_time = local_clock(); ++ ++ spin_lock(&j->lock); ++ if (bch2_journal_error(j) || ++ w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { ++ w->noflush = true; ++ SET_JSET_NO_FLUSH(jset, true); ++ jset->last_seq = 0; ++ w->last_seq = 0; ++ ++ j->nr_noflush_writes++; ++ } else { ++ j->last_flush_write = jiffies; ++ j->nr_flush_writes++; ++ } ++ spin_unlock(&j->lock); ++ ++ /* ++ * New btree roots are set by journalling them; when the journal entry ++ * gets written we have to propagate them to c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the btree roots ++ * (at least for now); so after we copy btree roots to c->btree_roots we ++ * have to get any missing btree roots and add them to this journal ++ * entry: ++ */ ++ ++ bch2_journal_entries_to_btree_roots(c, jset); ++ ++ start = end = vstruct_last(jset); ++ ++ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ ++ bch2_journal_super_entries_add_common(c, &end, ++ le64_to_cpu(jset->seq)); ++ u64s = (u64 *) end - (u64 *) start; ++ BUG_ON(u64s > j->entry_u64s_reserved); ++ ++ le32_add_cpu(&jset->u64s, u64s); ++ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); ++ ++ jset->magic = cpu_to_le64(jset_magic(c)); ++ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ++ ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) ++ : cpu_to_le32(c->sb.version); ++ ++ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); ++ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); ++ ++ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) ++ j->last_empty_seq = le64_to_cpu(jset->seq); ++ ++ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) ++ validate_before_checksum = true; ++ ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) ++ validate_before_checksum = true; ++ ++ if (validate_before_checksum && ++ jset_validate_for_write(c, jset)) ++ goto err; ++ ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret)) ++ goto err; ++ ++ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), ++ journal_nonce(jset), jset); ++ ++ if (!validate_before_checksum && ++ jset_validate_for_write(c, jset)) ++ goto err; ++ ++ sectors = vstruct_sectors(jset, c->block_bits); ++ BUG_ON(sectors > w->sectors); ++ ++ bytes = vstruct_bytes(jset); ++ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ ++retry_alloc: ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w, sectors); ++ ++ if (ret && j->can_discard) { ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ goto retry_alloc; ++ } ++ ++ if (ret) ++ __bch2_journal_debug_to_text(&journal_debug_buf, j); ++ ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ if (ret) { ++ bch_err(c, "Unable to allocate journal write:\n%s", ++ journal_debug_buf.buf); ++ printbuf_exit(&journal_debug_buf); ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++ return; ++ } ++ ++ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ ++ if (c->opts.nochanges) ++ goto no_io; ++ ++ for_each_rw_member(ca, c, i) ++ nr_rw_members++; ++ ++ if (nr_rw_members > 1) ++ w->separate_flush = true; ++ ++ if (!JSET_NO_FLUSH(jset) && w->separate_flush) { ++ for_each_rw_member(ca, c, i) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ } ++ ++ continue_at(cl, do_journal_write, c->io_complete_wq); ++ return; ++no_io: ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++ return; ++err: ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++} +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +new file mode 100644 +index 000000000000..30e995c81fc4 +--- /dev/null ++++ b/fs/bcachefs/journal_io.h +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_IO_H ++#define _BCACHEFS_JOURNAL_IO_H ++ ++/* ++ * Only used for holding the journal entries we read in btree_journal_read() ++ * during cache_registration ++ */ ++struct journal_replay { ++ struct journal_ptr { ++ u8 dev; ++ u32 bucket; ++ u32 bucket_offset; ++ u64 sector; ++ } ptrs[BCH_REPLICAS_MAX]; ++ unsigned nr_ptrs; ++ ++ /* checksum error, but we may want to try using it anyways: */ ++ bool bad; ++ bool ignore; ++ /* must be last: */ ++ struct jset j; ++}; ++ ++static inline struct jset_entry 
*__jset_entry_type_next(struct jset *jset, ++ struct jset_entry *entry, unsigned type) ++{ ++ while (entry < vstruct_last(jset)) { ++ if (entry->type == type) ++ return entry; ++ ++ entry = vstruct_next(entry); ++ } ++ ++ return NULL; ++} ++ ++#define for_each_jset_entry_type(entry, jset, type) \ ++ for (entry = (jset)->start; \ ++ (entry = __jset_entry_type_next(jset, entry, type)); \ ++ entry = vstruct_next(entry)) ++ ++#define for_each_jset_key(k, _n, entry, jset) \ ++ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ ++ vstruct_for_each_safe(entry, k, _n) ++ ++int bch2_journal_entry_validate(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, ++ struct jset_entry *); ++ ++void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct journal_replay *); ++ ++int bch2_journal_read(struct bch_fs *, u64 *, u64 *); ++ ++void bch2_journal_write(struct closure *); ++ ++#endif /* _BCACHEFS_JOURNAL_IO_H */ +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +new file mode 100644 +index 000000000000..6f0ab411c98e +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.c +@@ -0,0 +1,852 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "errcode.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++/* Free space calculations: */ ++ ++static unsigned journal_space_from(struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ switch (from) { ++ case journal_space_discarded: ++ return ja->discard_idx; ++ case journal_space_clean_ondisk: ++ return ja->dirty_idx_ondisk; ++ case journal_space_clean: ++ return ja->dirty_idx; ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *j, ++ struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; ++ ++ /* ++ * Don't use the last bucket unless writing the new last_seq ++ * will make another bucket available: ++ */ ++ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) ++ --available; ++ ++ return available; ++} ++ ++static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) ++{ ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ new.remaining = u64s_remaining; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++} ++ ++static struct journal_space ++journal_dev_space_available(struct journal *j, struct bch_dev *ca, ++ enum journal_space_from from) ++{ ++ struct journal_device *ja = &ca->journal; ++ unsigned sectors, buckets, unwritten; ++ u64 seq; ++ ++ if (from == journal_space_total) ++ return (struct journal_space) { ++ .next_entry = ca->mi.bucket_size, ++ .total = ca->mi.bucket_size * ja->nr, ++ }; ++ ++ buckets = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors = ja->sectors_free; ++ ++ /* ++ * Note that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j); ++ seq++) { ++ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; ++ ++ if (!unwritten) ++ continue; ++ ++ /* entry won't fit on this device, skip: 
*/ ++ if (unwritten > ca->mi.bucket_size) ++ continue; ++ ++ if (unwritten >= sectors) { ++ if (!buckets) { ++ sectors = 0; ++ break; ++ } ++ ++ buckets--; ++ sectors = ca->mi.bucket_size; ++ } ++ ++ sectors -= unwritten; ++ } ++ ++ if (sectors < ca->mi.bucket_size && buckets) { ++ buckets--; ++ sectors = ca->mi.bucket_size; ++ } ++ ++ return (struct journal_space) { ++ .next_entry = sectors, ++ .total = sectors + buckets * ca->mi.bucket_size, ++ }; ++} ++ ++static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned i, pos, nr_devs = 0; ++ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; ++ ++ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ if (!ca->journal.nr) ++ continue; ++ ++ space = journal_dev_space_available(j, ca, from); ++ if (!space.next_entry) ++ continue; ++ ++ for (pos = 0; pos < nr_devs; pos++) ++ if (space.total > dev_space[pos].total) ++ break; ++ ++ array_insert_item(dev_space, nr_devs, pos, space); ++ } ++ rcu_read_unlock(); ++ ++ if (nr_devs < nr_devs_want) ++ return (struct journal_space) { 0, 0 }; ++ ++ /* ++ * We sorted largest to smallest, and we want the smallest out of the ++ * @nr_devs_want largest devices: ++ */ ++ return dev_space[nr_devs_want - 1]; ++} ++ ++void bch2_journal_space_available(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned clean, clean_ondisk, total; ++ s64 u64s_remaining = 0; ++ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, ++ j->buf[1].buf_size >> 9); ++ unsigned i, nr_online = 0, nr_devs_want; ++ bool can_discard = false; ++ int ret = 0; ++ ++ lockdep_assert_held(&j->lock); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ while (ja->dirty_idx != ja->cur_idx && ++ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ ++ while (ja->dirty_idx_ondisk != ja->dirty_idx && ++ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ ++ if (ja->discard_idx != ja->dirty_idx_ondisk) ++ can_discard = true; ++ ++ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); ++ nr_online++; ++ } ++ rcu_read_unlock(); ++ ++ j->can_discard = can_discard; ++ ++ if (nr_online < c->opts.metadata_replicas_required) { ++ ret = JOURNAL_ERR_insufficient_devices; ++ goto out; ++ } ++ ++ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); ++ ++ for (i = 0; i < journal_space_nr; i++) ++ j->space[i] = __journal_space_available(j, nr_devs_want, i); ++ ++ clean_ondisk = j->space[journal_space_clean_ondisk].total; ++ clean = j->space[journal_space_clean].total; ++ total = j->space[journal_space_total].total; ++ ++ if (!clean_ondisk && ++ journal_cur_seq(j) == j->seq_ondisk) { ++ struct printbuf buf = PRINTBUF; ++ ++ __bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "journal stuck\n%s", buf.buf); ++ printbuf_exit(&buf); ++ ++ /* ++ * Hack: bch2_fatal_error() calls bch2_journal_halt() which ++ * takes journal lock: ++ */ ++ spin_unlock(&j->lock); ++ bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ ++ ret = JOURNAL_ERR_journal_stuck; ++ } else 
if (!j->space[journal_space_discarded].next_entry) ++ ret = JOURNAL_ERR_journal_full; ++ ++ if ((j->space[journal_space_clean_ondisk].next_entry < ++ j->space[journal_space_clean_ondisk].total) && ++ (clean - clean_ondisk <= total / 8) && ++ (clean_ondisk * 2 > clean )) ++ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); ++ else ++ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); ++ ++ u64s_remaining = (u64) clean << 6; ++ u64s_remaining -= (u64) total << 3; ++ u64s_remaining = max(0LL, u64s_remaining); ++ u64s_remaining /= 4; ++ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); ++out: ++ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; ++ j->cur_entry_error = ret; ++ journal_set_remaining(j, u64s_remaining); ++ journal_set_watermark(j); ++ ++ if (!ret) ++ journal_wake(j); ++} ++ ++/* Discards - last part of journal reclaim: */ ++ ++static bool should_discard_bucket(struct journal *j, struct journal_device *ja) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = ja->discard_idx != ja->dirty_idx_ondisk; ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * Advance ja->discard_idx as long as it points to buckets that are no longer ++ * dirty, issuing discards if necessary: ++ */ ++void bch2_journal_do_discards(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ mutex_lock(&j->discard_lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ ++ while (should_discard_bucket(j, ja)) { ++ if (!c->opts.nochanges && ++ ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, ++ ja->buckets[ja->discard_idx]), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ spin_lock(&j->lock); ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ } ++ } ++ ++ mutex_unlock(&j->discard_lock); ++} ++ ++/* ++ * Journal entry pinning - machinery for holding a reference on a given journal ++ * entry, holding it open to ensure it gets replayed during recovery: ++ */ ++ ++static void bch2_journal_reclaim_fast(struct journal *j) ++{ ++ struct journal_entry_pin_list temp; ++ bool popped = false; ++ ++ lockdep_assert_held(&j->lock); ++ ++ /* ++ * Unpin journal entries whose reference counts reached zero, meaning ++ * all btree nodes got written out ++ */ ++ while (!fifo_empty(&j->pin) && ++ !atomic_read(&fifo_peek_front(&j->pin).count)) { ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); ++ BUG_ON(!fifo_pop(&j->pin, temp)); ++ popped = true; ++ } ++ ++ if (popped) ++ bch2_journal_space_available(j); ++} ++ ++void __bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) ++ bch2_journal_reclaim_fast(j); ++} ++ ++void bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) { ++ spin_lock(&j->lock); ++ bch2_journal_reclaim_fast(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++static inline void __journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ if (!journal_pin_active(pin)) ++ return; ++ ++ if (j->flush_in_progress == pin) ++ j->flush_in_progress_dropped = true; ++ ++ pin_list 
= journal_seq_pin(j, pin->seq); ++ pin->seq = 0; ++ list_del_init(&pin->list); ++ ++ /* ++ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * writing a new last_seq will now make another bucket available: ++ */ ++ if (atomic_dec_and_test(&pin_list->count) && ++ pin_list == &fifo_peek_front(&j->pin)) ++ bch2_journal_reclaim_fast(j); ++} ++ ++void bch2_journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_drop(j, pin); ++ spin_unlock(&j->lock); ++} ++ ++void bch2_journal_pin_set(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ spin_lock(&j->lock); ++ ++ if (seq < journal_last_seq(j)) { ++ /* ++ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on ++ * the src pin - with the pin dropped, the entry to pin might no ++ * longer to exist, but that means there's no longer anything to ++ * copy and we can bail out here: ++ */ ++ spin_unlock(&j->lock); ++ return; ++ } ++ ++ pin_list = journal_seq_pin(j, seq); ++ ++ __journal_pin_drop(j, pin); ++ ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ if (flush_fn == bch2_btree_key_cache_journal_flush) ++ list_add(&pin->list, &pin_list->key_cache_list); ++ else if (flush_fn) ++ list_add(&pin->list, &pin_list->list); ++ else ++ list_add(&pin->list, &pin_list->flushed); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++/** ++ * bch2_journal_pin_flush: ensure journal pin callback is no longer running ++ */ ++void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) ++{ ++ BUG_ON(journal_pin_active(pin)); ++ ++ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); ++} ++ ++/* ++ * Journal reclaim: flush references to open journal entries to reclaim space in ++ * the journal ++ * ++ * May be done by the journal code in the background as needed to free up space ++ * for more journal entries, or as part of doing a clean shutdown, or to migrate ++ * data off of a specific device: ++ */ ++ ++static struct journal_entry_pin * ++journal_get_next_pin(struct journal *j, ++ bool get_any, ++ bool get_key_cache, ++ u64 max_seq, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *ret = NULL; ++ ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { ++ if (*seq > max_seq && !get_any && !get_key_cache) ++ break; ++ ++ if (*seq <= max_seq || get_any) { ++ ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list); ++ if (ret) ++ return ret; ++ } ++ ++ if (*seq <= max_seq || get_any || get_key_cache) { ++ ret = list_first_entry_or_null(&pin_list->key_cache_list, ++ struct journal_entry_pin, list); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ return NULL; ++} ++ ++/* returns true if we did work */ ++static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_any, ++ unsigned min_key_cache) ++{ ++ struct journal_entry_pin *pin; ++ size_t nr_flushed = 0; ++ journal_pin_flush_fn flush_fn; ++ u64 seq; ++ int err; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ while (1) { ++ cond_resched(); ++ ++ j->last_flushed = jiffies; ++ ++ spin_lock(&j->lock); ++ pin = journal_get_next_pin(j, ++ min_any != 0, ++ min_key_cache != 0, ++ seq_to_flush, &seq); ++ if (pin) { ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = pin; ++ 
j->flush_in_progress_dropped = false; ++ flush_fn = pin->flush; ++ } ++ spin_unlock(&j->lock); ++ ++ if (!pin) ++ break; ++ ++ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) ++ min_key_cache--; ++ ++ if (min_any) ++ min_any--; ++ ++ err = flush_fn(j, pin, seq); ++ ++ spin_lock(&j->lock); ++ /* Pin might have been dropped or rearmed: */ ++ if (likely(!err && !j->flush_in_progress_dropped)) ++ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); ++ j->flush_in_progress = NULL; ++ j->flush_in_progress_dropped = false; ++ spin_unlock(&j->lock); ++ ++ wake_up(&j->pin_flush_wait); ++ ++ if (err) ++ break; ++ ++ nr_flushed++; ++ } ++ ++ return nr_flushed; ++} ++ ++static u64 journal_seq_to_flush(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ u64 seq_to_flush = 0; ++ unsigned iter; ++ ++ spin_lock(&j->lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ unsigned nr_buckets, bucket_to_flush; ++ ++ if (!ja->nr) ++ continue; ++ ++ /* Try to keep the journal at most half full: */ ++ nr_buckets = ja->nr / 2; ++ ++ /* And include pre-reservations: */ ++ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, ++ (ca->mi.bucket_size << 6) - ++ journal_entry_overhead(j)); ++ ++ nr_buckets = min(nr_buckets, ja->nr); ++ ++ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; ++ seq_to_flush = max(seq_to_flush, ++ ja->bucket_seq[bucket_to_flush]); ++ } ++ ++ /* Also flush if the pin fifo is more than half full */ ++ seq_to_flush = max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); ++ spin_unlock(&j->lock); ++ ++ return seq_to_flush; ++} ++ ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. 
++ */ ++static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ u64 seq_to_flush; ++ size_t min_nr, min_key_cache, nr_flushed; ++ unsigned flags; ++ int ret = 0; ++ ++ /* ++ * We can't invoke memory reclaim while holding the reclaim_lock - ++ * journal reclaim is required to make progress for memory reclaim ++ * (cleaning the caches), so we can't get stuck in memory reclaim while ++ * we're holding the reclaim lock: ++ */ ++ lockdep_assert_held(&j->reclaim_lock); ++ flags = memalloc_noreclaim_save(); ++ ++ do { ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (bch2_journal_error(j)) { ++ ret = -EIO; ++ break; ++ } ++ ++ bch2_journal_do_discards(j); ++ ++ seq_to_flush = journal_seq_to_flush(j); ++ min_nr = 0; ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(c->opts.journal_reclaim_delay))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 4 > j->prereserved.remaining) ++ min_nr = 1; ++ ++ if (fifo_free(&j->pin) <= 32) ++ min_nr = 1; ++ ++ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) ++ min_nr = 1; ++ ++ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); ++ ++ trace_journal_reclaim_start(c, direct, kicked, ++ min_nr, min_key_cache, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ atomic_read(&c->btree_cache.dirty), ++ c->btree_cache.used, ++ atomic_long_read(&c->btree_key_cache.nr_dirty), ++ atomic_long_read(&c->btree_key_cache.nr_keys)); ++ ++ nr_flushed = journal_flush_pins(j, seq_to_flush, ++ min_nr, min_key_cache); ++ ++ if (direct) ++ j->nr_direct_reclaim += nr_flushed; ++ else ++ j->nr_background_reclaim += nr_flushed; ++ trace_journal_reclaim_finish(c, nr_flushed); ++ ++ if (nr_flushed) ++ wake_up(&j->reclaim_wait); ++ } while ((min_nr || min_key_cache) && nr_flushed && !direct); ++ ++ memalloc_noreclaim_restore(flags); ++ ++ return ret; ++} ++ ++int bch2_journal_reclaim(struct journal *j) ++{ ++ return __bch2_journal_reclaim(j, true, true); ++} ++ ++static int bch2_journal_reclaim_thread(void *arg) ++{ ++ struct journal *j = arg; ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ unsigned long delay, now; ++ bool journal_empty; ++ int ret = 0; ++ ++ set_freezable(); ++ ++ j->last_flushed = jiffies; ++ ++ while (!ret && !kthread_should_stop()) { ++ bool kicked = j->reclaim_kicked; ++ ++ j->reclaim_kicked = false; ++ ++ mutex_lock(&j->reclaim_lock); ++ ret = __bch2_journal_reclaim(j, false, kicked); ++ mutex_unlock(&j->reclaim_lock); ++ ++ now = jiffies; ++ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); ++ j->next_reclaim = j->last_flushed + delay; ++ ++ if (!time_in_range(j->next_reclaim, now, now + delay)) ++ j->next_reclaim = now + delay; ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) ++ break; ++ if (j->reclaim_kicked) ++ break; ++ ++ spin_lock(&j->lock); ++ journal_empty = fifo_empty(&j->pin); ++ spin_unlock(&j->lock); ++ ++ if (journal_empty) ++ freezable_schedule(); ++ else if (time_after(j->next_reclaim, jiffies)) ++ freezable_schedule_timeout(j->next_reclaim - jiffies); ++ else ++ break; ++ } ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ return 0; ++} ++ ++void bch2_journal_reclaim_stop(struct journal *j) ++{ ++ struct task_struct *p = 
j->reclaim_thread; ++ ++ j->reclaim_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_journal_reclaim_start(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct task_struct *p; ++ int ret; ++ ++ if (j->reclaim_thread) ++ return 0; ++ ++ p = kthread_create(bch2_journal_reclaim_thread, j, ++ "bch-reclaim/%s", c->name); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ get_task_struct(p); ++ j->reclaim_thread = p; ++ wake_up_process(p); ++ return 0; ++} ++ ++static int journal_flush_done(struct journal *j, u64 seq_to_flush, ++ bool *did_work) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&j->reclaim_lock); ++ ++ if (journal_flush_pins(j, seq_to_flush, 0, 0)) ++ *did_work = true; ++ ++ spin_lock(&j->lock); ++ /* ++ * If journal replay hasn't completed, the unreplayed journal entries ++ * hold refs on their corresponding sequence numbers ++ */ ++ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || ++ journal_last_seq(j) > seq_to_flush || ++ !fifo_used(&j->pin); ++ ++ spin_unlock(&j->lock); ++ mutex_unlock(&j->reclaim_lock); ++ ++ return ret; ++} ++ ++bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) ++{ ++ bool did_work = false; ++ ++ if (!test_bit(JOURNAL_STARTED, &j->flags)) ++ return false; ++ ++ closure_wait_event(&j->async_wait, ++ journal_flush_done(j, seq_to_flush, &did_work)); ++ ++ return did_work; ++} ++ ++int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ u64 iter, seq = 0; ++ int ret = 0; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(p, &j->pin, iter) ++ if (dev_idx >= 0 ++ ? 
bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_pins(j, seq); ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->replicas_gc_lock); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); ++ ++ seq = 0; ++ ++ spin_lock(&j->lock); ++ while (!ret) { ++ struct bch_replicas_padded replicas; ++ ++ seq = max(seq, journal_last_seq(j)); ++ if (seq >= j->pin.back) ++ break; ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ journal_seq_pin(j, seq)->devs); ++ seq++; ++ ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } ++ spin_unlock(&j->lock); ++ ++ ret = bch2_replicas_gc_end(c, ret); ++ mutex_unlock(&c->replicas_gc_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +new file mode 100644 +index 000000000000..0fd1af120db5 +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.h +@@ -0,0 +1,86 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_RECLAIM_H ++#define _BCACHEFS_JOURNAL_RECLAIM_H ++ ++#define JOURNAL_PIN (32 * 1024) ++ ++static inline void journal_reclaim_kick(struct journal *j) ++{ ++ struct task_struct *p = READ_ONCE(j->reclaim_thread); ++ ++ j->reclaim_kicked = true; ++ if (p) ++ wake_up_process(p); ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *, ++ struct journal_device *, ++ enum journal_space_from); ++void bch2_journal_space_available(struct journal *); ++ ++static inline bool journal_pin_active(struct journal_entry_pin *pin) ++{ ++ return pin->seq != 0; ++} ++ ++static inline struct journal_entry_pin_list * ++journal_seq_pin(struct journal *j, u64 seq) ++{ ++ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); ++ ++ return &j->pin.data[seq & j->pin.mask]; ++} ++ ++void __bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++static inline void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) ++ bch2_journal_pin_set(j, seq, pin, flush_fn); ++} ++ ++static inline void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) ++{ ++ /* Guard against racing with journal_pin_drop(src): */ ++ u64 seq = READ_ONCE(src->seq); ++ ++ if (seq) ++ bch2_journal_pin_add(j, seq, dst, flush_fn); ++} ++ ++static inline void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) ++ bch2_journal_pin_set(j, seq, pin, flush_fn); ++} ++ ++void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_do_discards(struct journal *); ++int bch2_journal_reclaim(struct journal *); ++ ++void bch2_journal_reclaim_stop(struct journal *); ++int bch2_journal_reclaim_start(struct journal *); ++ ++bool bch2_journal_flush_pins(struct journal *, u64); ++ ++static inline bool bch2_journal_flush_all_pins(struct journal *j) ++{ ++ return bch2_journal_flush_pins(j, U64_MAX); ++} ++ ++int bch2_journal_flush_device_pins(struct journal *, int); 
++ ++#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +new file mode 100644 +index 000000000000..001cecec1291 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.c +@@ -0,0 +1,220 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal_sb.h" ++#include "darray.h" ++ ++#include ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ const u64 *l = _l; ++ const u64 *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int bch2_sb_journal_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ if (!b[0]) { ++ prt_printf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0] < le16_to_cpu(m->first_bucket)) { ++ prt_printf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { ++ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) { ++ prt_printf(err, "duplicate journal buckets %llu", b[i]); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ unsigned i, nr = bch2_nr_journal_buckets(journal); ++ ++ prt_printf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_journal_validate, ++ .to_text = bch2_sb_journal_to_text, ++}; ++ ++struct u64_range { ++ u64 start; ++ u64 end; ++}; ++ ++static int u64_range_cmp(const void *_l, const void *_r) ++{ ++ const struct u64_range *l = _l; ++ const struct u64_range *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++static int bch2_sb_journal_v2_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ struct u64_range *b; ++ ++ nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) { ++ b[i].start = le64_to_cpu(journal->d[i].start); ++ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); ++ } ++ ++ sort(b, nr, sizeof(*b), u64_range_cmp, NULL); ++ ++ if (!b[0].start) { ++ prt_printf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0].start < le16_to_cpu(m->first_bucket)) { ++ prt_printf(err, "journal bucket %llu before first bucket %u", ++ b[0].start, le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1].end 
> le64_to_cpu(m->nbuckets)) { ++ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) { ++ if (b[i].end > b[i + 1].start) { ++ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ ++ prt_printf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ prt_printf(out, " %llu-%llu", ++ le64_to_cpu(journal->d[i].start), ++ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { ++ .validate = bch2_sb_journal_v2_validate, ++ .to_text = bch2_sb_journal_v2_to_text, ++}; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal_v2 *j; ++ unsigned i, dst = 0, nr = 1; ++ ++ if (c) ++ lockdep_assert_held(&c->sb_lock); ++ ++ if (!ja->nr) { ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); ++ return 0; ++ } ++ ++ for (i = 0; i + 1 < ja->nr; i++) ++ if (ja->buckets[i] + 1 != ja->buckets[i + 1]) ++ nr++; ++ ++ j = bch2_sb_resize_journal_v2(&ca->disk_sb, ++ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); ++ if (!j) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ ++ j->d[dst].start = le64_to_cpu(ja->buckets[0]); ++ j->d[dst].nr = le64_to_cpu(1); ++ ++ for (i = 1; i < ja->nr; i++) { ++ if (ja->buckets[i] == ja->buckets[i - 1] + 1) { ++ le64_add_cpu(&j->d[dst].nr, 1); ++ } else { ++ dst++; ++ j->d[dst].start = le64_to_cpu(ja->buckets[i]); ++ j->d[dst].nr = le64_to_cpu(1); ++ } ++ } ++ ++ BUG_ON(dst + 1 != nr); ++ ++ return 0; ++} +diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h +new file mode 100644 +index 000000000000..a39192e9f6f4 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include "super-io.h" ++#include "vstructs.h" ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) ++{ ++ if (!j) ++ return 0; ++ ++ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal; ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +new file mode 100644 +index 000000000000..5c555b3703c0 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -0,0 +1,322 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "eytzinger.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++/* ++ * journal_seq_blacklist machinery: ++ * ++ * To guarantee order of btree updates after a crash, we need to detect when a ++ * btree node entry (bset) is newer than the newest journal entry that was ++ * successfully written, and ignore it - effectively ignoring any btree updates ++ * that didn't make it into the journal. ++ * ++ * If we didn't do this, we might have two btree nodes, a and b, both with ++ * updates that weren't written to the journal yet: if b was updated after a, ++ * but b was flushed and not a - oops; on recovery we'll find that the updates ++ * to b happened, but not the updates to a that happened before it. ++ * ++ * Ignoring bsets that are newer than the newest journal entry is always safe, ++ * because everything they contain will also have been journalled - and must ++ * still be present in the journal on disk until a journal entry has been ++ * written _after_ that bset was written. ++ * ++ * To accomplish this, bsets record the newest journal sequence number they ++ * contain updates for; then, on startup, the btree code queries the journal ++ * code to ask "Is this sequence number newer than the newest journal entry? If ++ * so, ignore it." ++ * ++ * When this happens, we must blacklist that journal sequence number: the ++ * journal must not write any entries with that sequence number, and it must ++ * record that it was blacklisted so that a) on recovery we don't think we have ++ * missing journal entries and b) so that the btree code continues to ignore ++ * that bset, until that btree node is rewritten. 
++ */ ++ ++static unsigned sb_blacklist_u64s(unsigned nr) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ ++ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); ++} ++ ++static struct bch_sb_field_journal_seq_blacklist * ++blacklist_entry_try_merge(struct bch_fs *c, ++ struct bch_sb_field_journal_seq_blacklist *bl, ++ unsigned i) ++{ ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ if (le64_to_cpu(bl->start[i].end) >= ++ le64_to_cpu(bl->start[i + 1].start)) { ++ bl->start[i].end = bl->start[i + 1].end; ++ --nr; ++ memmove(&bl->start[i], ++ &bl->start[i + 1], ++ sizeof(bl->start[0]) * (nr - i)); ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr)); ++ BUG_ON(!bl); ++ } ++ ++ return bl; ++} ++ ++static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, ++ u64 start, u64 end) ++{ ++ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); ++} ++ ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ unsigned i, nr; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ nr = blacklist_nr_entries(bl); ++ ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (bl_entry_contig_or_overlaps(e, start, end)) { ++ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); ++ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; ++ } ++ } ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr + 1)); ++ if (!bl) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ bl->start[nr].start = cpu_to_le64(start); ++ bl->start[nr].end = cpu_to_le64(end); ++out_write_sb: ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ++ ++ ret = bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret ?: bch2_blacklist_table_initialize(c); ++} ++ ++static int journal_seq_blacklist_table_cmp(const void *_l, ++ const void *_r, size_t size) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, ++ bool dirty) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx; ++ ++ if (!t) ++ return false; ++ ++ idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0) ++ return false; ++ ++ BUG_ON(t->entries[idx].start > seq); ++ ++ if (seq >= t->entries[idx].end) ++ return false; ++ ++ if (dirty) ++ t->entries[idx].dirty = true; ++ return true; ++} ++ ++int bch2_blacklist_table_initialize(struct bch_fs *c) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ struct journal_seq_blacklist_table *t; ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ if (!bl) ++ return 0; ++ ++ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ t->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ t->entries[i].start = le64_to_cpu(bl->start[i].start); ++ t->entries[i].end = le64_to_cpu(bl->start[i].end); ++ } 
++ ++ eytzinger0_sort(t->entries, ++ t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ NULL); ++ ++ kfree(c->journal_seq_blacklist_table); ++ c->journal_seq_blacklist_table = t; ++ return 0; ++} ++ ++static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = bl->start + i; ++ ++ if (le64_to_cpu(e->start) >= ++ le64_to_cpu(e->end)) { ++ prt_printf(err, "entry %u start >= end (%llu >= %llu)", ++ i, le64_to_cpu(e->start), le64_to_cpu(e->end)); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < nr && ++ le64_to_cpu(e[0].end) > ++ le64_to_cpu(e[1].start)) { ++ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", ++ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (i != bl->start) ++ prt_printf(out, " "); ++ ++ prt_printf(out, "%llu-%llu", ++ le64_to_cpu(i->start), ++ le64_to_cpu(i->end)); ++ } ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { ++ .validate = bch2_sb_journal_seq_blacklist_validate, ++ .to_text = bch2_sb_journal_seq_blacklist_to_text ++}; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter iter; ++ struct btree *b; ++ ++ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, ++ 0, 0, BTREE_ITER_PREFETCH); ++retry: ++ bch2_trans_begin(&trans); ++ ++ b = bch2_btree_iter_peek_node(&iter); ++ ++ while (!(ret = PTR_ERR_OR_ZERO(b)) && ++ b && ++ !test_bit(BCH_FS_STOPPING, &c->flags)) ++ b = bch2_btree_iter_next_node(&iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? 
sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +new file mode 100644 +index 000000000000..afb886ec8e25 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -0,0 +1,22 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++ ++static inline unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); ++int bch2_blacklist_table_initialize(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; ++ ++void bch2_blacklist_entries_gc(struct work_struct *); ++ ++#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +new file mode 100644 +index 000000000000..a6cdb885ad41 +--- /dev/null ++++ b/fs/bcachefs/journal_types.h +@@ -0,0 +1,340 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_TYPES_H ++#define _BCACHEFS_JOURNAL_TYPES_H ++ ++#include ++#include ++ ++#include "alloc_types.h" ++#include "super_types.h" ++#include "fifo.h" ++ ++#define JOURNAL_BUF_BITS 2 ++#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) ++#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) ++ ++/* ++ * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to ++ * the journal that are being staged or in flight. ++ */ ++struct journal_buf { ++ struct jset *data; ++ ++ __BKEY_PADDED(key, BCH_REPLICAS_MAX); ++ struct bch_devs_list devs_written; ++ ++ struct closure_waitlist wait; ++ u64 last_seq; /* copy of data->last_seq */ ++ long expires; ++ u64 flush_time; ++ ++ unsigned buf_size; /* size in bytes of @data */ ++ unsigned sectors; /* maximum size for current entry */ ++ unsigned disk_sectors; /* maximum size entry could have been, if ++ buf_size was bigger */ ++ unsigned u64s_reserved; ++ bool noflush; /* write has already been kicked off, and was noflush */ ++ bool must_flush; /* something wants a flush */ ++ bool separate_flush; ++}; ++ ++/* ++ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be ++ * flushed: ++ */ ++ ++struct journal_entry_pin_list { ++ struct list_head list; ++ struct list_head key_cache_list; ++ struct list_head flushed; ++ atomic_t count; ++ struct bch_devs_list devs; ++}; ++ ++struct journal; ++struct journal_entry_pin; ++typedef int (*journal_pin_flush_fn)(struct journal *j, ++ struct journal_entry_pin *, u64); ++ ++struct journal_entry_pin { ++ struct list_head list; ++ journal_pin_flush_fn flush; ++ u64 seq; ++}; ++ ++struct journal_res { ++ bool ref; ++ u8 idx; ++ u16 u64s; ++ u32 offset; ++ u64 seq; ++}; ++ ++/* ++ * For reserving space in the journal prior to getting a reservation on a ++ * particular journal entry: ++ */ ++struct journal_preres { ++ unsigned u64s; ++}; ++ ++union journal_res_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 cur_entry_offset:20, ++ idx:2, ++ unwritten_idx:2, ++ buf0_count:10, ++ buf1_count:10, ++ buf2_count:10, ++ buf3_count:10; ++ }; ++}; ++ ++union journal_preres_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 waiting:1, ++ reserved:31, ++ remaining:32; ++ }; ++}; ++ ++/* bytes: */ ++#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++ ++/* ++ * We stash some journal state as sentinal values in cur_entry_offset: ++ * note - cur_entry_offset is in units of u64s ++ */ ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++ ++#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) ++#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) ++ ++struct journal_space { ++ /* Units of 512 bytes sectors: */ ++ unsigned next_entry; /* How big the next journal entry can be */ ++ unsigned total; ++}; ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++ journal_space_total, ++ journal_space_nr, ++}; ++ ++enum { ++ JOURNAL_REPLAY_DONE, ++ JOURNAL_STARTED, ++ JOURNAL_MAY_SKIP_FLUSH, ++}; ++ ++#define JOURNAL_WATERMARKS() \ ++ x(any) \ ++ x(copygc) \ ++ x(reserved) ++ ++enum journal_watermark { ++#define x(n) JOURNAL_WATERMARK_##n, ++ JOURNAL_WATERMARKS() ++#undef x ++}; ++ ++#define JOURNAL_WATERMARK_MASK 3 ++ ++/* Reasons we may fail to get a journal reservation: */ ++#define JOURNAL_ERRORS() \ ++ x(ok) \ ++ x(blocked) \ ++ x(max_in_flight) \ ++ x(journal_full) \ ++ x(journal_pin_full) \ ++ x(journal_stuck) \ ++ x(insufficient_devices) ++ ++enum journal_errors { ++#define x(n) JOURNAL_ERR_##n, ++ JOURNAL_ERRORS() ++#undef x ++}; ++ ++/* Embedded in struct bch_fs */ ++struct journal { ++ /* Fastpath stuff up front: */ ++ ++ unsigned long flags; ++ ++ union journal_res_state reservations; ++ enum journal_watermark watermark; ++ ++ /* Max size of current journal entry */ ++ unsigned cur_entry_u64s; ++ unsigned cur_entry_sectors; ++ ++ /* ++ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if ++ * insufficient devices: ++ */ ++ enum journal_errors cur_entry_error; ++ ++ union journal_preres_state prereserved; ++ ++ /* Reserved space in journal entry to be used just prior to write */ ++ unsigned entry_u64s_reserved; ++ ++ unsigned buf_size_want; ++ ++ /* ++ * Two journal entries -- one is currently open for new entries, the ++ * other is possibly being written out. 
++ */ ++ struct journal_buf buf[JOURNAL_BUF_NR]; ++ ++ spinlock_t lock; ++ ++ /* if nonzero, we may not open a new journal entry: */ ++ unsigned blocked; ++ ++ /* Used when waiting because the journal was full */ ++ wait_queue_head_t wait; ++ struct closure_waitlist async_wait; ++ struct closure_waitlist preres_wait; ++ ++ struct closure io; ++ struct delayed_work write_work; ++ ++ /* Sequence number of most recent journal entry (last entry in @pin) */ ++ atomic64_t seq; ++ ++ /* seq, last_seq from the most recent journal entry successfully written */ ++ u64 seq_ondisk; ++ u64 flushed_seq_ondisk; ++ u64 last_seq_ondisk; ++ u64 err_seq; ++ u64 last_empty_seq; ++ ++ /* ++ * FIFO of journal entries whose btree updates have not yet been ++ * written out. ++ * ++ * Each entry is a reference count. The position in the FIFO is the ++ * entry's sequence number relative to @seq. ++ * ++ * The journal entry itself holds a reference count, put when the ++ * journal entry is written out. Each btree node modified by the journal ++ * entry also holds a reference count, put when the btree node is ++ * written. ++ * ++ * When a reference count reaches zero, the journal entry is no longer ++ * needed. When all journal entries in the oldest journal bucket are no ++ * longer needed, the bucket can be discarded and reused. ++ */ ++ struct { ++ u64 front, back, size, mask; ++ struct journal_entry_pin_list *data; ++ } pin; ++ ++ struct journal_space space[journal_space_nr]; ++ ++ u64 replay_journal_seq; ++ u64 replay_journal_seq_end; ++ ++ struct write_point wp; ++ spinlock_t err_lock; ++ ++ struct mutex reclaim_lock; ++ /* ++ * Used for waiting until journal reclaim has freed up space in the ++ * journal: ++ */ ++ wait_queue_head_t reclaim_wait; ++ struct task_struct *reclaim_thread; ++ bool reclaim_kicked; ++ unsigned long next_reclaim; ++ u64 nr_direct_reclaim; ++ u64 nr_background_reclaim; ++ ++ unsigned long last_flushed; ++ struct journal_entry_pin *flush_in_progress; ++ bool flush_in_progress_dropped; ++ wait_queue_head_t pin_flush_wait; ++ ++ /* protects advancing ja->discard_idx: */ ++ struct mutex discard_lock; ++ bool can_discard; ++ ++ unsigned long last_flush_write; ++ ++ u64 res_get_blocked_start; ++ u64 write_start_time; ++ ++ u64 nr_flush_writes; ++ u64 nr_noflush_writes; ++ ++ struct time_stats *flush_write_time; ++ struct time_stats *noflush_write_time; ++ struct time_stats *blocked_time; ++ struct time_stats *flush_seq_time; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map res_map; ++#endif ++}; ++ ++/* ++ * Embedded in struct bch_dev. First three fields refer to the array of journal ++ * buckets, in bch_sb. ++ */ ++struct journal_device { ++ /* ++ * For each journal bucket, contains the max sequence number of the ++ * journal writes it contains - so we know when a bucket can be reused. 
++ */ ++ u64 *bucket_seq; ++ ++ unsigned sectors_free; ++ ++ /* ++ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: ++ */ ++ unsigned discard_idx; /* Next bucket to discard */ ++ unsigned dirty_idx_ondisk; ++ unsigned dirty_idx; ++ unsigned cur_idx; /* Journal bucket we're currently writing to */ ++ unsigned nr; ++ ++ u64 *buckets; ++ ++ /* Bio for journal reads/writes to this device */ ++ struct bio *bio; ++ ++ /* for bch_journal_read_device */ ++ struct closure read; ++}; ++ ++/* ++ * journal_entry_res - reserve space in every journal entry: ++ */ ++struct journal_entry_res { ++ unsigned u64s; ++}; ++ ++#endif /* _BCACHEFS_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +new file mode 100644 +index 000000000000..cda77835b9ea +--- /dev/null ++++ b/fs/bcachefs/keylist.c +@@ -0,0 +1,67 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "keylist.h" ++ ++int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, ++ size_t nr_inline_u64s, size_t new_u64s) ++{ ++ size_t oldsize = bch2_keylist_u64s(l); ++ size_t newsize = oldsize + new_u64s; ++ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; ++ u64 *new_keys; ++ ++ newsize = roundup_pow_of_two(newsize); ++ ++ if (newsize <= nr_inline_u64s || ++ (old_buf && roundup_pow_of_two(oldsize) == newsize)) ++ return 0; ++ ++ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); ++ if (!new_keys) ++ return -ENOMEM; ++ ++ if (!old_buf) ++ memcpy_u64s(new_keys, inline_u64s, oldsize); ++ ++ l->keys_p = new_keys; ++ l->top_p = new_keys + oldsize; ++ ++ return 0; ++} ++ ++void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) ++{ ++ struct bkey_i *where; ++ ++ for_each_keylist_key(l, where) ++ if (bkey_cmp(insert->k.p, where->k.p) < 0) ++ break; ++ ++ memmove_u64s_up((u64 *) where + insert->k.u64s, ++ where, ++ ((u64 *) l->top) - ((u64 *) where)); ++ ++ l->top_p += insert->k.u64s; ++ bkey_copy(where, insert); ++} ++ ++void bch2_keylist_pop_front(struct keylist *l) ++{ ++ l->top_p -= bch2_keylist_front(l)->k.u64s; ++ ++ memmove_u64s_down(l->keys, ++ bkey_next(l->keys), ++ bch2_keylist_u64s(l)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *l) ++{ ++ struct bkey_i *k; ++ ++ for_each_keylist_key(l, k) ++ BUG_ON(bkey_next(k) != l->top && ++ bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++} ++#endif +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +new file mode 100644 +index 000000000000..195799bb20bc +--- /dev/null ++++ b/fs/bcachefs/keylist.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_H ++#define _BCACHEFS_KEYLIST_H ++ ++#include "keylist_types.h" ++ ++int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); ++void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); ++void bch2_keylist_pop_front(struct keylist *); ++ ++static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) ++{ ++ l->top_p = l->keys_p = inline_keys; ++} ++ ++static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) ++{ ++ if (l->keys_p != inline_keys) ++ kfree(l->keys_p); ++ bch2_keylist_init(l, inline_keys); ++} ++ ++static inline void bch2_keylist_push(struct keylist *l) ++{ ++ l->top = bkey_next(l->top); ++} ++ ++static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) ++{ ++ bkey_copy(l->top, k); ++ bch2_keylist_push(l); ++} ++ ++static inline bool bch2_keylist_empty(struct keylist *l) ++{ ++ return l->top == 
l->keys; ++} ++ ++static inline size_t bch2_keylist_u64s(struct keylist *l) ++{ ++ return l->top_p - l->keys_p; ++} ++ ++static inline size_t bch2_keylist_bytes(struct keylist *l) ++{ ++ return bch2_keylist_u64s(l) * sizeof(u64); ++} ++ ++static inline struct bkey_i *bch2_keylist_front(struct keylist *l) ++{ ++ return l->keys; ++} ++ ++#define for_each_keylist_key(_keylist, _k) \ ++ for (_k = (_keylist)->keys; \ ++ _k != (_keylist)->top; \ ++ _k = bkey_next(_k)) ++ ++static inline u64 keylist_sectors(struct keylist *keys) ++{ ++ struct bkey_i *k; ++ u64 ret = 0; ++ ++ for_each_keylist_key(keys, k) ++ ret += k->k.size; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *); ++#else ++static inline void bch2_verify_keylist_sorted(struct keylist *l) {} ++#endif ++ ++#endif /* _BCACHEFS_KEYLIST_H */ +diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h +new file mode 100644 +index 000000000000..4b3ff7d8a875 +--- /dev/null ++++ b/fs/bcachefs/keylist_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_TYPES_H ++#define _BCACHEFS_KEYLIST_TYPES_H ++ ++struct keylist { ++ union { ++ struct bkey_i *keys; ++ u64 *keys_p; ++ }; ++ union { ++ struct bkey_i *top; ++ u64 *top_p; ++ }; ++}; ++ ++#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +new file mode 100644 +index 000000000000..53e607d72274 +--- /dev/null ++++ b/fs/bcachefs/lru.c +@@ -0,0 +1,206 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "error.h" ++#include "lru.h" ++#include "recovery.h" ++ ++int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*lru)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*lru)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); ++} ++ ++int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time, ++ struct bkey_s_c orig_k) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 existing_idx; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (!time) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, ++ POS(id, time), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ bch2_bkey_val_to_text(&buf, trans->c, orig_k); ++ bch2_trans_inconsistent(trans, ++ "pointer to nonexistent lru %llu:%llu\n%s", ++ id, time, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ if (existing_idx != idx) { ++ bch2_bkey_val_to_text(&buf, trans->c, orig_k); ++ bch2_trans_inconsistent(trans, ++ "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s", ++ id, time, existing_idx, idx, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) ++{ ++ struct btree_iter 
iter; ++ struct bkey_s_c k; ++ struct bkey_i_lru *lru; ++ int ret = 0; ++ ++ if (!*time) ++ return 0; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, ++ POS(lru_id, *time), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES, k, ret) ++ if (bkey_deleted(k.k)) ++ break; ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(iter.pos.inode != lru_id); ++ *time = iter.pos.offset; ++ ++ lru = bch2_trans_kmalloc(trans, sizeof(*lru)); ++ ret = PTR_ERR_OR_ZERO(lru); ++ if (ret) ++ goto err; ++ ++ bkey_lru_init(&lru->k_i); ++ lru->k.p = iter.pos; ++ lru->v.idx = cpu_to_le64(idx); ++ ++ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, ++ u64 old_time, u64 *new_time, ++ struct bkey_s_c k) ++{ ++ if (old_time == *new_time) ++ return 0; ++ ++ return bch2_lru_delete(trans, id, idx, old_time, k) ?: ++ bch2_lru_set(trans, id, idx, new_time); ++} ++ ++static int bch2_check_lru_key(struct btree_trans *trans, ++ struct btree_iter *lru_iter, ++ struct bkey_s_c lru_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct bpos alloc_pos; ++ int ret; ++ ++ alloc_pos = POS(lru_k.k->p.inode, ++ le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, ++ "lru key points to nonexistent device:bucket %llu:%llu", ++ alloc_pos.inode, alloc_pos.offset)) ++ return bch2_btree_delete_at(trans, lru_iter, 0); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (fsck_err_on(a.data_type != BCH_DATA_cached || ++ a.io_time[READ] != lru_k.k->p.offset, c, ++ "incorrect lru entry %s\n" ++ " for %s", ++ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = lru_iter->pos; ++ ++ ret = bch2_trans_update(trans, lru_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++int bch2_check_lrus(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_lru_key(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++ ++} +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +new file mode 100644 +index 000000000000..3decb7b1dde2 +--- /dev/null ++++ b/fs/bcachefs/lru.h +@@ -0,0 +1,19 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_LRU_H ++#define _BCACHEFS_LRU_H ++ ++int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++ .key_invalid = bch2_lru_invalid, \ ++ .val_to_text = bch2_lru_to_text, \ ++} ++ ++int bch2_lru_delete(struct 
btree_trans *, u64, u64, u64, struct bkey_s_c); ++int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); ++int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); ++ ++int bch2_check_lrus(struct bch_fs *); ++ ++#endif /* _BCACHEFS_LRU_H */ +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +new file mode 100644 +index 000000000000..8b258d966d04 +--- /dev/null ++++ b/fs/bcachefs/migrate.c +@@ -0,0 +1,186 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for moving data off a device. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "errcode.h" ++#include "extents.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "migrate.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, ++ unsigned dev_idx, int flags, bool metadata) ++{ ++ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; ++ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; ++ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; ++ unsigned nr_good; ++ ++ bch2_bkey_drop_device(k, dev_idx); ++ ++ nr_good = bch2_bkey_durability(c, k.s_c); ++ if ((!nr_good && !(flags & lost)) || ++ (nr_good < replicas && !(flags & degraded))) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ unsigned dev_idx, ++ int flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i *n; ++ int ret; ++ ++ if (!bch2_bkey_has_device(k, dev_idx)) ++ return 0; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); ++ if (ret) ++ return ret; ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(n)); ++ ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&n->k)) ++ n->k.size = 0; ++ ++ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++} ++ ++static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id id; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!btree_type_has_ptrs(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct closure cl; ++ struct btree *b; ++ struct bkey_buf k; ++ unsigned id; ++ int ret; ++ ++ /* 
don't handle this yet: */ ++ if (flags & BCH_FORCE_IF_METADATA_LOST) ++ return -EINVAL; ++ ++ bch2_bkey_buf_init(&k); ++ bch2_trans_init(&trans, c, 0, 0); ++ closure_init_stack(&cl); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, ++ BTREE_ITER_PREFETCH); ++retry: ++ ret = 0; ++ while (bch2_trans_begin(&trans), ++ (b = bch2_btree_iter_peek_node(&iter)) && ++ !(ret = PTR_ERR_OR_ZERO(b))) { ++ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), ++ dev_idx)) ++ goto next; ++ ++ bch2_bkey_buf_copy(&k, c, &b->key); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), ++ dev_idx, flags, true); ++ if (ret) { ++ bch_err(c, "Cannot drop device without losing data"); ++ break; ++ } ++ ++ ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ ret = 0; ++ continue; ++ } ++ ++ if (ret) { ++ bch_err(c, "Error updating btree node key: %s", ++ bch2_err_str(ret)); ++ break; ++ } ++next: ++ bch2_btree_iter_next_node(&iter); ++ } ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ bch2_btree_interior_updates_flush(c); ++ ret = 0; ++err: ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&k, c); ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); ++ ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, dev_idx, flags); ++} +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +new file mode 100644 +index 000000000000..027efaa0d575 +--- /dev/null ++++ b/fs/bcachefs/migrate.h +@@ -0,0 +1,7 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MIGRATE_H ++#define _BCACHEFS_MIGRATE_H ++ ++int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++ ++#endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +new file mode 100644 +index 000000000000..2fc247451390 +--- /dev/null ++++ b/fs/bcachefs/move.c +@@ -0,0 +1,952 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "backpointers.h" ++#include "bkey_buf.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "errcode.h" ++#include "error.h" ++#include "inode.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "keylist.h" ++ ++#include ++#include ++ ++#include ++ ++static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ mutex_lock(&c->data_progress_lock); ++ list_add(&stats->list, &c->data_progress_list); ++ mutex_unlock(&c->data_progress_lock); ++} ++ ++static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ mutex_lock(&c->data_progress_lock); ++ list_del(&stats->list); ++ mutex_unlock(&c->data_progress_lock); ++} ++ ++struct moving_io { ++ struct list_head list; ++ struct closure cl; ++ bool read_completed; ++ ++ unsigned read_sectors; ++ unsigned write_sectors; ++ ++ struct bch_read_bio rbio; ++ ++ struct data_update write; ++ /* Must be last since it is variable size */ ++ struct bio_vec bi_inline_vecs[0]; ++}; ++ ++static void move_free(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = 
io->write.ctxt; ++ struct bch_fs *c = ctxt->c; ++ ++ bch2_data_update_exit(&io->write); ++ wake_up(&ctxt->wait); ++ percpu_ref_put(&c->writes); ++ kfree(io); ++} ++ ++static void move_write_done(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ if (io->write.op.error) ++ ctxt->write_error = true; ++ ++ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_return_with_destructor(cl, move_free); ++} ++ ++static void move_write(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ closure_return_with_destructor(cl, move_free); ++ return; ++ } ++ ++ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); ++ ++ bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl); ++ continue_at(cl, move_write_done, NULL); ++} ++ ++static inline struct moving_io *next_pending_write(struct moving_context *ctxt) ++{ ++ struct moving_io *io = ++ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); ++ ++ return io && io->read_completed ? io : NULL; ++} ++ ++static void move_read_endio(struct bio *bio) ++{ ++ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ atomic_sub(io->read_sectors, &ctxt->read_sectors); ++ io->read_completed = true; ++ ++ wake_up(&ctxt->wait); ++ closure_put(&ctxt->cl); ++} ++ ++static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) ++{ ++ struct moving_io *io; ++ ++ if (trans) ++ bch2_trans_unlock(trans); ++ ++ while ((io = next_pending_write(ctxt))) { ++ list_del(&io->list); ++ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ } ++} ++ ++#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ ++do { \ ++ do_pending_writes(_ctxt, _trans); \ ++ \ ++ if (_cond) \ ++ break; \ ++ __wait_event((_ctxt)->wait, \ ++ next_pending_write(_ctxt) || (_cond)); \ ++} while (1) ++ ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, ++ struct btree_trans *trans) ++{ ++ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); ++ ++ move_ctxt_wait_event(ctxt, trans, ++ !atomic_read(&ctxt->write_sectors) || ++ atomic_read(&ctxt->write_sectors) != sectors_pending); ++} ++ ++void bch2_moving_ctxt_exit(struct moving_context *ctxt) ++{ ++ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); ++ closure_sync(&ctxt->cl); ++ EBUG_ON(atomic_read(&ctxt->write_sectors)); ++ ++ if (ctxt->stats) { ++ progress_list_del(ctxt->c, ctxt->stats); ++ ++ trace_move_data(ctxt->c, ++ atomic64_read(&ctxt->stats->sectors_moved), ++ atomic64_read(&ctxt->stats->keys_moved)); ++ } ++} ++ ++void bch2_moving_ctxt_init(struct moving_context *ctxt, ++ struct bch_fs *c, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc) ++{ ++ memset(ctxt, 0, sizeof(*ctxt)); ++ ++ ctxt->c = c; ++ ctxt->rate = rate; ++ ctxt->stats = stats; ++ ctxt->wp = wp; ++ ctxt->wait_on_copygc = wait_on_copygc; ++ ++ closure_init_stack(&ctxt->cl); ++ INIT_LIST_HEAD(&ctxt->reads); ++ init_waitqueue_head(&ctxt->wait); ++ ++ if (stats) { ++ progress_list_add(c, stats); ++ stats->data_type = BCH_DATA_user; ++ } ++} ++ ++void bch_move_stats_init(struct bch_move_stats *stats, char *name) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ scnprintf(stats->name, sizeof(stats->name), "%s", name); ++} ++ ++static int bch2_move_extent(struct 
btree_trans *trans, ++ struct moving_context *ctxt, ++ struct bch_io_opts io_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct data_update_opts data_opts) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct moving_io *io; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned sectors = k.k->size, pages; ++ int ret = -ENOMEM; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return -EROFS; ++ ++ /* write path might have to decompress data: */ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ++ ++ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ io = kzalloc(sizeof(struct moving_io) + ++ sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ if (!io) ++ goto err; ++ ++ io->write.ctxt = ctxt; ++ io->read_sectors = k.k->size; ++ io->write_sectors = k.k->size; ++ ++ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); ++ bio_set_prio(&io->write.op.wbio.bio, ++ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ ++ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, ++ GFP_KERNEL)) ++ goto err_free; ++ ++ io->rbio.c = c; ++ io->rbio.opts = io_opts; ++ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); ++ io->rbio.bio.bi_vcnt = pages; ++ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ ++ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); ++ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ io->rbio.bio.bi_end_io = move_read_endio; ++ ++ ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, ++ data_opts, btree_id, k); ++ if (ret) ++ goto err_free_pages; ++ ++ io->write.ctxt = ctxt; ++ ++ atomic64_inc(&ctxt->stats->keys_moved); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); ++ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); ++ ++ trace_move_extent(k.k); ++ ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ list_add_tail(&io->list, &ctxt->reads); ++ ++ /* ++ * dropped by move_read_endio() - guards against use after free of ++ * ctxt when doing wakeup ++ */ ++ closure_get(&ctxt->cl); ++ bch2_read_extent(trans, &io->rbio, ++ bkey_start_pos(k.k), ++ btree_id, k, 0, ++ BCH_READ_NODECODE| ++ BCH_READ_LAST_FRAGMENT); ++ return 0; ++err_free_pages: ++ bio_free_pages(&io->write.op.wbio.bio); ++err_free: ++ kfree(io); ++err: ++ percpu_ref_put(&c->writes); ++ trace_move_alloc_mem_fail(k.k); ++ return ret; ++} ++ ++static int lookup_inode(struct btree_trans *trans, struct bpos pos, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || bkey_cmp(k.k->p, pos)) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bkey_is_inode(k.k) ? 0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(k, inode); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int move_ratelimit(struct btree_trans *trans, ++ struct moving_context *ctxt) ++{ ++ struct bch_fs *c = trans->c; ++ u64 delay; ++ ++ if (ctxt->wait_on_copygc) { ++ bch2_trans_unlock(trans); ++ wait_event_killable(c->copygc_running_wq, ++ !c->copygc_running || ++ kthread_should_stop()); ++ } ++ ++ do { ++ delay = ctxt->rate ? 
bch2_ratelimit_delay(ctxt->rate) : 0; ++ ++ if (delay) { ++ bch2_trans_unlock(trans); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 1; ++ } ++ ++ if (delay) ++ schedule_timeout(delay); ++ ++ if (unlikely(freezing(current))) { ++ move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); ++ try_to_freeze(); ++ } ++ } while (delay); ++ ++ move_ctxt_wait_event(ctxt, trans, ++ atomic_read(&ctxt->write_sectors) < ++ c->opts.move_bytes_in_flight >> 9); ++ ++ move_ctxt_wait_event(ctxt, trans, ++ atomic_read(&ctxt->read_sectors) < ++ c->opts.move_bytes_in_flight >> 9); ++ ++ return 0; ++} ++ ++static int move_get_io_opts(struct btree_trans *trans, ++ struct bch_io_opts *io_opts, ++ struct bkey_s_c k, u64 *cur_inum) ++{ ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ if (*cur_inum == k.k->p.inode) ++ return 0; ++ ++ *io_opts = bch2_opts_to_inode_opts(trans->c->opts); ++ ++ ret = lookup_inode(trans, ++ SPOS(0, k.k->p.inode, k.k->p.snapshot), ++ &inode); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ ++ if (!ret) ++ bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode)); ++ ++ *cur_inum = k.k->p.inode; ++ return 0; ++} ++ ++static int __bch2_move_data(struct moving_context *ctxt, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ enum btree_id btree_id) ++{ ++ struct bch_fs *c = ctxt->c; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct bkey_buf sk; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct data_update_opts data_opts; ++ u64 cur_inum = U64_MAX; ++ int ret = 0, ret2; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ctxt->stats->data_type = BCH_DATA_user; ++ ctxt->stats->btree_id = btree_id; ++ ctxt->stats->pos = start; ++ ++ bch2_trans_iter_init(&trans, &iter, btree_id, start, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ ++ if (ctxt->rate) ++ bch2_ratelimit_reset(ctxt->rate); ++ ++ while (!move_ratelimit(&trans, ctxt)) { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_btree_iter_peek(&iter); ++ if (!k.k) ++ break; ++ ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ ctxt->stats->pos = iter.pos; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ goto next_nondata; ++ ++ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); ++ if (ret) ++ continue; ++ ++ memset(&data_opts, 0, sizeof(data_opts)); ++ if (!pred(c, arg, k, &io_opts, &data_opts)) ++ goto next; ++ ++ /* ++ * The iterator gets unlocked by __bch2_read_extent - need to ++ * save a copy of @k elsewhere: ++ */ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ ret2 = bch2_move_extent(&trans, ctxt, io_opts, ++ btree_id, k, data_opts); ++ if (ret2) { ++ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) ++ continue; ++ ++ if (ret2 == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); ++ continue; ++ } ++ ++ /* XXX signal failure */ ++ goto next; ++ } ++ ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, k.k->size); ++next: ++ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); ++next_nondata: ++ bch2_btree_iter_advance(&iter); ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); 
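++	/*
++	 * Note that ret does not reflect failures to move individual extents
++	 * (ret2 above) - those are currently dropped, per the XXX note above.
++	 */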
++ ++ return ret; ++} ++ ++int bch2_move_data(struct bch_fs *c, ++ enum btree_id start_btree_id, struct bpos start_pos, ++ enum btree_id end_btree_id, struct bpos end_pos, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc, ++ move_pred_fn pred, void *arg) ++{ ++ struct moving_context ctxt; ++ enum btree_id id; ++ int ret; ++ ++ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ++ ++ for (id = start_btree_id; ++ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id++) { ++ stats->btree_id = id; ++ ++ if (id != BTREE_ID_extents && ++ id != BTREE_ID_reflink) ++ continue; ++ ++ ret = __bch2_move_data(&ctxt, ++ id == start_btree_id ? start_pos : POS_MIN, ++ id == end_btree_id ? end_pos : POS_MAX, ++ pred, arg, id); ++ if (ret) ++ break; ++ } ++ ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ return ret; ++} ++ ++static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ bucket, BTREE_ITER_CACHED); ++again: ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ ++ if (!ret && k.k->type == KEY_TYPE_alloc_v4) { ++ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); ++ ++ if (a.v->gen == gen && ++ a.v->dirty_sectors) { ++ struct printbuf buf = PRINTBUF; ++ ++ if (a.v->data_type == BCH_DATA_btree) { ++ bch2_trans_unlock(trans); ++ if (bch2_btree_interior_updates_flush(c)) ++ goto again; ++ } ++ ++ prt_str(&buf, "failed to evacuate bucket "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int __bch2_evacuate_bucket(struct moving_context *ctxt, ++ struct bpos bucket, int gen, ++ struct data_update_opts _data_opts) ++{ ++ struct bch_fs *c = ctxt->c; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ struct bch_backpointer bp; ++ struct data_update_opts data_opts; ++ u64 bp_offset = 0, cur_inum = U64_MAX; ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ while (!(ret = move_ratelimit(&trans, ctxt))) { ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_get_next_backpointer(&trans, bucket, gen, ++ &bp_offset, &bp); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (bp_offset == U64_MAX) ++ break; ++ ++ if (!bp.level) { ++ const struct bch_extent_ptr *ptr; ++ struct bkey_s_c k; ++ unsigned i = 0; ++ ++ k = bch2_backpointer_get_key(&trans, &iter, ++ bucket, bp_offset, bp); ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (!k.k) ++ continue; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); ++ if (ret) ++ continue; ++ ++ data_opts = _data_opts; ++ data_opts.target = io_opts.background_target; ++ data_opts.rewrite_ptrs = 0; ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { ++ if (ptr->dev == bucket.inode) ++ data_opts.rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ ++ ret = bch2_move_extent(&trans, ctxt, io_opts, ++ bp.btree_id, k, data_opts); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; 
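++			/*
++			 * Note: a transaction restart loops back around without
++			 * advancing bp_offset, so the same backpointer is simply
++			 * retried.
++			 */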
++ if (ret == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); ++ continue; ++ } ++ if (ret) ++ goto err; ++ ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, k.k->size); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); ++ } else { ++ struct btree *b; ++ ++ b = bch2_backpointer_get_node(&trans, &iter, ++ bucket, bp_offset, bp); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (!b) ++ continue; ++ ++ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, ++ c->opts.btree_node_size >> 9); ++ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); ++ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); ++ } ++ ++ bp_offset++; ++ } ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { ++ bch2_trans_unlock(&trans); ++ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); ++ closure_sync(&ctxt->cl); ++ if (!ctxt->write_error) ++ lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ return ret; ++} ++ ++int bch2_evacuate_bucket(struct bch_fs *c, ++ struct bpos bucket, int gen, ++ struct data_update_opts data_opts, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc) ++{ ++ struct moving_context ctxt; ++ int ret; ++ ++ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ++ ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts); ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ return ret; ++} ++ ++typedef bool (*move_btree_pred)(struct bch_fs *, void *, ++ struct btree *, struct bch_io_opts *, ++ struct data_update_opts *); ++ ++static int bch2_move_btree(struct bch_fs *c, ++ enum btree_id start_btree_id, struct bpos start_pos, ++ enum btree_id end_btree_id, struct bpos end_pos, ++ move_btree_pred pred, void *arg, ++ struct bch_move_stats *stats) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct btree *b; ++ enum btree_id id; ++ struct data_update_opts data_opts; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ progress_list_add(c, stats); ++ ++ stats->data_type = BCH_DATA_btree; ++ ++ for (id = start_btree_id; ++ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id++) { ++ stats->btree_id = id; ++ ++ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, ++ BTREE_ITER_PREFETCH); ++retry: ++ ret = 0; ++ while (bch2_trans_begin(&trans), ++ (b = bch2_btree_iter_peek_node(&iter)) && ++ !(ret = PTR_ERR_OR_ZERO(b))) { ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if ((cmp_int(id, end_btree_id) ?: ++ bpos_cmp(b->key.k.p, end_pos)) > 0) ++ break; ++ ++ stats->pos = iter.pos; ++ ++ if (!pred(c, arg, b, &io_opts, &data_opts)) ++ goto next; ++ ++ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++next: ++ bch2_btree_iter_next_node(&iter); ++ } ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ 
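++		/*
++		 * If the caller is a kthread that's been asked to stop, bail
++		 * out here instead of walking the remaining btrees.
++		 */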
++ if (kthread && kthread_should_stop()) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_btree_interior_updates_flush(c); ++ ++ progress_list_del(c, stats); ++ return ret; ++} ++ ++static bool rereplicate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ unsigned nr_good = bch2_bkey_durability(c, k); ++ unsigned replicas = bkey_is_btree_ptr(k.k) ++ ? c->opts.metadata_replicas ++ : io_opts->data_replicas; ++ ++ if (!nr_good || nr_good >= replicas) ++ return false; ++ ++ data_opts->target = 0; ++ data_opts->extra_replicas = replicas - nr_good; ++ data_opts->btree_insert_flags = 0; ++ return true; ++} ++ ++static bool migrate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ struct bch_ioctl_data *op = arg; ++ unsigned i = 0; ++ ++ data_opts->rewrite_ptrs = 0; ++ data_opts->target = 0; ++ data_opts->extra_replicas = 0; ++ data_opts->btree_insert_flags = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr->dev == op->migrate.dev) ++ data_opts->rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ ++ return data_opts->rewrite_ptrs != 0;; ++} ++ ++static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); ++} ++ ++static bool migrate_btree_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); ++} ++ ++static bool bformat_needs_redo(struct bkey_format *f) ++{ ++ unsigned i; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > unpacked_bits) ++ return true; ++ ++ if ((f->bits_per_field[i] == unpacked_bits) && field_offset) ++ return true; ++ ++ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & ++ unpacked_mask) < ++ field_offset) ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ if (b->version_ondisk != c->sb.version || ++ btree_node_need_rewrite(b) || ++ bformat_needs_redo(&b->format)) { ++ data_opts->target = 0; ++ data_opts->extra_replicas = 0; ++ data_opts->btree_insert_flags = 0; ++ return true; ++ } ++ ++ return false; ++} ++ ++int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ int ret; ++ ++ ret = bch2_move_btree(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, SPOS_MAX, ++ rewrite_old_nodes_pred, c, stats); ++ if (!ret) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); ++ c->disk_sb.sb->version_min = c->disk_sb.sb->version; ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return ret; ++} ++ ++int bch2_data_job(struct bch_fs *c, ++ struct bch_move_stats *stats, ++ 
struct bch_ioctl_data op) ++{ ++ int ret = 0; ++ ++ switch (op.op) { ++ case BCH_DATA_OP_REREPLICATE: ++ bch_move_stats_init(stats, "rereplicate"); ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, -1); ++ ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ rereplicate_btree_pred, c, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ NULL, ++ stats, ++ writepoint_hashed((unsigned long) current), ++ true, ++ rereplicate_pred, c) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case BCH_DATA_OP_MIGRATE: ++ if (op.migrate.dev >= c->sb.nr_devices) ++ return -EINVAL; ++ ++ bch_move_stats_init(stats, "migrate"); ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ++ ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ migrate_btree_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ NULL, ++ stats, ++ writepoint_hashed((unsigned long) current), ++ true, ++ migrate_pred, &op) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case BCH_DATA_OP_REWRITE_OLD_NODES: ++ bch_move_stats_init(stats, "rewrite_old_nodes"); ++ ret = bch2_scan_old_btree_nodes(c, stats); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +new file mode 100644 +index 000000000000..c0fec69bbb6a +--- /dev/null ++++ b/fs/bcachefs/move.h +@@ -0,0 +1,67 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_H ++#define _BCACHEFS_MOVE_H ++ ++#include "btree_iter.h" ++#include "buckets.h" ++#include "data_update.h" ++#include "move_types.h" ++ ++struct bch_read_bio; ++ ++struct moving_context { ++ struct bch_fs *c; ++ struct bch_ratelimit *rate; ++ struct bch_move_stats *stats; ++ struct write_point_specifier wp; ++ bool wait_on_copygc; ++ bool write_error; ++ ++ /* For waiting on outstanding reads and writes: */ ++ struct closure cl; ++ struct list_head reads; ++ ++ /* in flight sectors: */ ++ atomic_t read_sectors; ++ atomic_t write_sectors; ++ ++ wait_queue_head_t wait; ++}; ++ ++typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, ++ struct bch_io_opts *, struct data_update_opts *); ++ ++void bch2_moving_ctxt_exit(struct moving_context *); ++void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, ++ struct bch_ratelimit *, struct bch_move_stats *, ++ struct write_point_specifier, bool); ++ ++int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); ++ ++int bch2_move_data(struct bch_fs *, ++ enum btree_id, struct bpos, ++ enum btree_id, struct bpos, ++ struct bch_ratelimit *, ++ struct bch_move_stats *, ++ struct write_point_specifier, ++ bool, ++ move_pred_fn, void *); ++ ++int __bch2_evacuate_bucket(struct moving_context *, ++ struct bpos, int, ++ struct data_update_opts); ++int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, ++ struct data_update_opts, ++ struct bch_ratelimit *, ++ struct bch_move_stats *, ++ struct write_point_specifier, ++ bool); ++int bch2_data_job(struct bch_fs *, ++ struct bch_move_stats *, ++ struct bch_ioctl_data); ++ ++inline void bch_move_stats_init(struct bch_move_stats *stats, ++ char *name); ++ ++ ++#endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h 
b/fs/bcachefs/move_types.h +new file mode 100644 +index 000000000000..9df6d18137a5 +--- /dev/null ++++ b/fs/bcachefs/move_types.h +@@ -0,0 +1,19 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_TYPES_H ++#define _BCACHEFS_MOVE_TYPES_H ++ ++struct bch_move_stats { ++ enum bch_data_type data_type; ++ enum btree_id btree_id; ++ struct bpos pos; ++ struct list_head list; ++ char name[32]; ++ ++ atomic64_t keys_moved; ++ atomic64_t keys_raced; ++ atomic64_t sectors_moved; ++ atomic64_t sectors_seen; ++ atomic64_t sectors_raced; ++}; ++ ++#endif /* _BCACHEFS_MOVE_TYPES_H */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +new file mode 100644 +index 000000000000..f913864eaa4f +--- /dev/null ++++ b/fs/bcachefs/movinggc.c +@@ -0,0 +1,285 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Moving/copying garbage collector ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "errcode.h" ++#include "error.h" ++#include "extents.h" ++#include "eytzinger.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "movinggc.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline int fragmentation_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.fragmentation, r.fragmentation); ++} ++ ++static int find_buckets_to_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * Find buckets with lowest sector counts, skipping completely ++ * empty buckets, by building a maxheap sorted by sector count, ++ * and repeatedly replacing the maximum element until all ++ * buckets have been visited. 
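++	 *
++	 * (The replacement is done by heap_add_or_replace() below; what's left
++	 * in the heap afterwards is the set of emptiest non-empty buckets,
++	 * i.e. the cheapest ones to evacuate.)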
++ */ ++ h->used = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); ++ struct copygc_heap_entry e; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if ((a.data_type != BCH_DATA_btree && ++ a.data_type != BCH_DATA_user) || ++ a.dirty_sectors >= ca->mi.bucket_size || ++ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = iter.pos.inode, ++ .gen = a.gen, ++ .replicas = 1 + a.stripe_redundancy, ++ .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), ++ ca->mi.bucket_size), ++ .sectors = a.dirty_sectors, ++ .bucket = iter.pos.offset, ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bch2_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct copygc_heap_entry e; ++ struct bch_move_stats move_stats; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ size_t heap_size = 0; ++ struct moving_context ctxt; ++ struct data_update_opts data_opts = { ++ .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, ++ }; ++ int ret = 0; ++ ++ bch_move_stats_init(&move_stats, "copygc"); ++ ++ for_each_rw_member(ca, c, dev_idx) ++ heap_size += ca->mi.nbuckets >> 7; ++ ++ if (h->size < heap_size) { ++ free_heap(&c->copygc_heap); ++ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { ++ bch_err(c, "error allocating copygc heap"); ++ return 0; ++ } ++ } ++ ++ ret = find_buckets_to_copygc(c); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error walking buckets to copygc!"); ++ return ret; ++ } ++ ++ if (!h->used) { ++ s64 wait = S64_MAX, dev_wait; ++ u64 dev_min_wait_fragmented = 0; ++ u64 dev_min_wait_allowed = 0; ++ int dev_min_wait = -1; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ++ ca->mi.bucket_size) >> 1); ++ s64 fragmented = usage.d[BCH_DATA_user].fragmented; ++ ++ dev_wait = max(0LL, allowed - fragmented); ++ ++ if (dev_min_wait < 0 || dev_wait < wait) { ++ dev_min_wait = dev_idx; ++ dev_min_wait_fragmented = fragmented; ++ dev_min_wait_allowed = allowed; ++ } ++ } ++ ++ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu", ++ dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed); ++ return 0; ++ } ++ ++ heap_resort(h, fragmentation_cmp, NULL); ++ ++ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, ++ writepoint_ptr(&c->copygc_write_point), ++ false); ++ ++ /* not correct w.r.t. device removal */ ++ while (h->used && !ret) { ++ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); ++ ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen, ++ data_opts); ++ } ++ ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ if (ret < 0) ++ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); ++ ++ trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); ++ return ret; ++} ++ ++/* ++ * Copygc runs when the amount of fragmented data is above some arbitrary ++ * threshold: ++ * ++ * The threshold at the limit - when the device is full - is the amount of space ++ * we reserved in bch2_recalc_capacity; we can't have more than that amount of ++ * disk space stranded due to fragmentation and store everything we have ++ * promised to store. 
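++ *
++ * (For a concrete feel, with made-up numbers: a device that still has ~100GB
++ * of buckets available gets a fragmented_allowed of ~50GB in
++ * bch2_copygc_wait_amount() below, so copygc stays idle until more than that
++ * much space is stranded in partially empty buckets.)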
++ * ++ * But we don't want to be running copygc unnecessarily when the device still ++ * has plenty of free space - rather, we want copygc to smoothly run every so ++ * often and continually reduce the amount of fragmented space as the device ++ * fills up. So, we increase the threshold by half the current free space. ++ */ ++unsigned long bch2_copygc_wait_amount(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ s64 wait = S64_MAX, fragmented_allowed, fragmented; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ ++ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ++ ca->mi.bucket_size) >> 1); ++ fragmented = usage.d[BCH_DATA_user].fragmented; ++ ++ wait = min(wait, max(0LL, fragmented_allowed - fragmented)); ++ } ++ ++ return wait; ++} ++ ++static int bch2_copygc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ u64 last, wait; ++ int ret = 0; ++ ++ set_freezable(); ++ ++ while (!ret && !kthread_should_stop()) { ++ cond_resched(); ++ ++ if (kthread_wait_freezable(c->copy_gc_enabled)) ++ break; ++ ++ last = atomic64_read(&clock->now); ++ wait = bch2_copygc_wait_amount(c); ++ ++ if (wait > clock->max_slop) { ++ trace_copygc_wait(c, wait, last + wait); ++ c->copygc_wait = last + wait; ++ bch2_kthread_io_clock_wait(clock, last + wait, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ c->copygc_wait = 0; ++ ++ c->copygc_running = true; ++ ret = bch2_copygc(c); ++ c->copygc_running = false; ++ ++ wake_up(&c->copygc_running_wq); ++ } ++ ++ return 0; ++} ++ ++void bch2_copygc_stop(struct bch_fs *c) ++{ ++ if (c->copygc_thread) { ++ kthread_stop(c->copygc_thread); ++ put_task_struct(c->copygc_thread); ++ } ++ c->copygc_thread = NULL; ++} ++ ++int bch2_copygc_start(struct bch_fs *c) ++{ ++ struct task_struct *t; ++ int ret; ++ ++ if (c->copygc_thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ if (bch2_fs_init_fault("copygc_start")) ++ return -ENOMEM; ++ ++ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ++ ret = PTR_ERR_OR_ZERO(t); ++ if (ret) { ++ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ get_task_struct(t); ++ ++ c->copygc_thread = t; ++ wake_up_process(c->copygc_thread); ++ ++ return 0; ++} ++ ++void bch2_fs_copygc_init(struct bch_fs *c) ++{ ++ init_waitqueue_head(&c->copygc_running_wq); ++ c->copygc_running = false; ++} +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +new file mode 100644 +index 000000000000..e85c8136a46e +--- /dev/null ++++ b/fs/bcachefs/movinggc.h +@@ -0,0 +1,10 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVINGGC_H ++#define _BCACHEFS_MOVINGGC_H ++ ++unsigned long bch2_copygc_wait_amount(struct bch_fs *); ++void bch2_copygc_stop(struct bch_fs *); ++int bch2_copygc_start(struct bch_fs *); ++void bch2_fs_copygc_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +new file mode 100644 +index 000000000000..407b221e8f6c +--- /dev/null ++++ b/fs/bcachefs/opts.c +@@ -0,0 +1,578 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++ ++#include "bcachefs.h" ++#include "compress.h" ++#include "disk_groups.h" ++#include "opts.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++#define x(t, n) [n] = #t, ++ ++const char * const bch2_metadata_versions[] = { ++ BCH_METADATA_VERSIONS() ++ NULL ++}; ++ ++const char * const bch2_error_actions[] 
= { ++ BCH_ERROR_ACTIONS() ++ NULL ++}; ++ ++const char * const bch2_sb_features[] = { ++ BCH_SB_FEATURES() ++ NULL ++}; ++ ++const char * const bch2_sb_compat[] = { ++ BCH_SB_COMPAT() ++ NULL ++}; ++ ++const char * const bch2_btree_ids[] = { ++ BCH_BTREE_IDS() ++ "interior btree node", ++ NULL ++}; ++ ++const char * const bch2_csum_types[] = { ++ BCH_CSUM_TYPES() ++ NULL ++}; ++ ++const char * const bch2_csum_opts[] = { ++ BCH_CSUM_OPTS() ++ NULL ++}; ++ ++const char * const bch2_compression_types[] = { ++ BCH_COMPRESSION_TYPES() ++ NULL ++}; ++ ++const char * const bch2_compression_opts[] = { ++ BCH_COMPRESSION_OPTS() ++ NULL ++}; ++ ++const char * const bch2_str_hash_types[] = { ++ BCH_STR_HASH_TYPES() ++ NULL ++}; ++ ++const char * const bch2_str_hash_opts[] = { ++ BCH_STR_HASH_OPTS() ++ NULL ++}; ++ ++const char * const bch2_data_types[] = { ++ BCH_DATA_TYPES() ++ NULL ++}; ++ ++const char * const bch2_member_states[] = { ++ BCH_MEMBER_STATES() ++ NULL ++}; ++ ++const char * const bch2_jset_entry_types[] = { ++ BCH_JSET_ENTRY_TYPES() ++ NULL ++}; ++ ++const char * const bch2_fs_usage_types[] = { ++ BCH_FS_USAGE_TYPES() ++ NULL ++}; ++ ++#undef x ++ ++const char * const bch2_d_types[BCH_DT_MAX] = { ++ [DT_UNKNOWN] = "unknown", ++ [DT_FIFO] = "fifo", ++ [DT_CHR] = "chr", ++ [DT_DIR] = "dir", ++ [DT_BLK] = "blk", ++ [DT_REG] = "reg", ++ [DT_LNK] = "lnk", ++ [DT_SOCK] = "sock", ++ [DT_WHT] = "whiteout", ++ [DT_SUBVOL] = "subvol", ++}; ++ ++u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) ++{ ++ BUG(); ++} ++ ++void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) ++{ ++ BUG(); ++} ++ ++void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) ++{ ++#define x(_name, ...) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ ++ BCH_OPTS() ++#undef x ++} ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opt_defined(*opts, _name); ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opts->_name; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ opt_set(*opts, _name, v); \ ++ break; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++const struct bch_option bch2_opt_table[] = { ++#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ ++ .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, \ ++ .min = 0, .max = ARRAY_SIZE(_choices),\ ++ .choices = _choices ++#define OPT_FN(_fn) .type = BCH_OPT_FN, \ ++ .parse = _fn##_parse, \ ++ .to_text = _fn##_to_text ++ ++#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ ++ [Opt_##_name] = { \ ++ .attr = { \ ++ .name = #_name, \ ++ .mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ ++ }, \ ++ .flags = _flags, \ ++ .hint = _hint, \ ++ .help = _help, \ ++ .get_sb = _sb_opt, \ ++ .set_sb = SET_##_sb_opt, \ ++ _type \ ++ }, ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++int bch2_opt_lookup(const char *name) ++{ ++ const struct bch_option *i; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); ++ i++) ++ if (!strcmp(name, i->attr.name)) ++ return i - bch2_opt_table; ++ ++ return -1; ++} ++ ++struct synonym { ++ const char *s1, *s2; ++}; ++ ++static const struct synonym bch_opt_synonyms[] = { ++ { "quota", "usrquota" }, ++}; ++ ++static int bch2_mount_opt_lookup(const char *name) ++{ ++ const struct synonym *i; ++ ++ for (i = bch_opt_synonyms; ++ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ i++) ++ if (!strcmp(name, i->s1)) ++ name = i->s2; ++ ++ return bch2_opt_lookup(name); ++} ++ ++int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) ++{ ++ if (v < opt->min) { ++ if (err) ++ prt_printf(err, "%s: too small (min %llu)", ++ opt->attr.name, opt->min); ++ return -ERANGE; ++ } ++ ++ if (opt->max && v >= opt->max) { ++ if (err) ++ prt_printf(err, "%s: too big (max %llu)", ++ opt->attr.name, opt->max); ++ return -ERANGE; ++ } ++ ++ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { ++ if (err) ++ prt_printf(err, "%s: not a multiple of 512", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { ++ if (err) ++ prt_printf(err, "%s: must be a power of two", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_opt_parse(struct bch_fs *c, ++ const struct bch_option *opt, ++ const char *val, u64 *res, ++ struct printbuf *err) ++{ ++ ssize_t ret; ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0 || (*res != 0 && *res != 1)) { ++ prt_printf(err, "%s: must be bool", ++ opt->attr.name); ++ return ret; ++ } ++ break; ++ case BCH_OPT_UINT: ++ ret = opt->flags & OPT_HUMAN_READABLE ++ ? bch2_strtou64_h(val, res) ++ : kstrtou64(val, 10, res); ++ if (ret < 0) { ++ if (err) ++ prt_printf(err, "%s: must be a number", ++ opt->attr.name); ++ return ret; ++ } ++ break; ++ case BCH_OPT_STR: ++ ret = match_string(opt->choices, -1, val); ++ if (ret < 0) { ++ if (err) ++ prt_printf(err, "%s: invalid selection", ++ opt->attr.name); ++ return ret; ++ } ++ ++ *res = ret; ++ break; ++ case BCH_OPT_FN: ++ if (!c) ++ return 0; ++ ++ ret = opt->parse(c, val, res); ++ if (ret < 0) { ++ if (err) ++ prt_printf(err, "%s: parse error", ++ opt->attr.name); ++ return ret; ++ } ++ } ++ ++ return bch2_opt_validate(opt, *res, err); ++} ++ ++void bch2_opt_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_sb *sb, ++ const struct bch_option *opt, u64 v, ++ unsigned flags) ++{ ++ if (flags & OPT_SHOW_MOUNT_STYLE) { ++ if (opt->type == BCH_OPT_BOOL) { ++ prt_printf(out, "%s%s", ++ v ? 
"" : "no", ++ opt->attr.name); ++ return; ++ } ++ ++ prt_printf(out, "%s=", opt->attr.name); ++ } ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ case BCH_OPT_UINT: ++ if (opt->flags & OPT_HUMAN_READABLE) ++ prt_human_readable_u64(out, v); ++ else ++ prt_printf(out, "%lli", v); ++ break; ++ case BCH_OPT_STR: ++ if (flags & OPT_SHOW_FULL_LIST) ++ prt_string_option(out, opt->choices, v); ++ else ++ prt_printf(out, "%s", opt->choices[v]); ++ break; ++ case BCH_OPT_FN: ++ opt->to_text(out, c, sb, v); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ++{ ++ int ret = 0; ++ ++ switch (id) { ++ case Opt_compression: ++ case Opt_background_compression: ++ ret = bch2_check_set_has_compressed_data(c, v); ++ break; ++ case Opt_erasure_code: ++ if (v) ++ bch2_check_set_feature(c, BCH_FEATURE_ec); ++ break; ++ } ++ ++ return ret; ++} ++ ++int bch2_opts_check_may_set(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ ret = bch2_opt_check_may_set(c, i, ++ bch2_opt_get_by_id(&c->opts, i)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, ++ char *options) ++{ ++ char *copied_opts, *copied_opts_start; ++ char *opt, *name, *val; ++ int ret, id; ++ struct printbuf err = PRINTBUF; ++ u64 v; ++ ++ if (!options) ++ return 0; ++ ++ copied_opts = kstrdup(options, GFP_KERNEL); ++ if (!copied_opts) ++ return -1; ++ copied_opts_start = copied_opts; ++ ++ while ((opt = strsep(&copied_opts, ",")) != NULL) { ++ name = strsep(&opt, "="); ++ val = opt; ++ ++ if (val) { ++ id = bch2_mount_opt_lookup(name); ++ if (id < 0) ++ goto bad_opt; ++ ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); ++ if (ret < 0) ++ goto bad_val; ++ } else { ++ id = bch2_mount_opt_lookup(name); ++ v = 1; ++ ++ if (id < 0 && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ v = 0; ++ } ++ ++ if (id < 0) ++ goto bad_opt; ++ ++ if (bch2_opt_table[id].type != BCH_OPT_BOOL) ++ goto no_val; ++ } ++ ++ if (!(bch2_opt_table[id].flags & OPT_MOUNT)) ++ goto bad_opt; ++ ++ if (id == Opt_acl && ++ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) ++ goto bad_opt; ++ ++ if ((id == Opt_usrquota || ++ id == Opt_grpquota) && ++ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) ++ goto bad_opt; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ ret = 0; ++ goto out; ++ ++bad_opt: ++ pr_err("Bad mount option %s", name); ++ ret = -1; ++ goto out; ++bad_val: ++ pr_err("Invalid mount option %s", err.buf); ++ ret = -1; ++ goto out; ++no_val: ++ pr_err("Mount option %s requires a value", name); ++ ret = -1; ++ goto out; ++out: ++ kfree(copied_opts_start); ++ printbuf_exit(&err); ++ return ret; ++} ++ ++u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) ++{ ++ const struct bch_option *opt = bch2_opt_table + id; ++ u64 v; ++ ++ v = opt->get_sb(sb); ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = 1ULL << v; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v <<= 9; ++ ++ return v; ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) ++{ ++ unsigned id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb == BCH2_NO_SB_OPT) ++ continue; ++ ++ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); ++ } ++ ++ return 0; ++} ++ ++void 
__bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v >>= 9; ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = ilog2(v); ++ ++ opt->set_sb(sb, v); ++} ++ ++void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ __bch2_opt_set_sb(c->disk_sb.sb, opt, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* io opts: */ ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) ++{ ++ struct bch_io_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) ++{ ++ struct bch_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) ++{ ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++} ++ ++bool bch2_opt_is_inode_opt(enum bch_opt_id id) ++{ ++ static const enum bch_opt_id inode_opt_list[] = { ++#define x(_name, _bits) Opt_##_name, ++ BCH_INODE_OPTS() ++#undef x ++ }; ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) ++ if (inode_opt_list[i] == id) ++ return true; ++ ++ return false; ++} +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +new file mode 100644 +index 000000000000..5b8586ecb374 +--- /dev/null ++++ b/fs/bcachefs/opts.h +@@ -0,0 +1,509 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_OPTS_H ++#define _BCACHEFS_OPTS_H ++ ++#include ++#include ++#include ++#include ++#include "bcachefs_format.h" ++ ++extern const char * const bch2_metadata_versions[]; ++extern const char * const bch2_error_actions[]; ++extern const char * const bch2_sb_features[]; ++extern const char * const bch2_sb_compat[]; ++extern const char * const bch2_btree_ids[]; ++extern const char * const bch2_csum_types[]; ++extern const char * const bch2_csum_opts[]; ++extern const char * const bch2_compression_types[]; ++extern const char * const bch2_compression_opts[]; ++extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_str_hash_opts[]; ++extern const char * const bch2_data_types[]; ++extern const char * const bch2_member_states[]; ++extern const char * const bch2_jset_entry_types[]; ++extern const char * const bch2_fs_usage_types[]; ++extern const char * const bch2_d_types[]; ++ ++static inline const char *bch2_d_type_str(unsigned d_type) ++{ ++ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; ++} ++ ++/* ++ * Mount options; we also store defaults in the superblock. ++ * ++ * Also exposed via sysfs: if an option is writeable, and it's also stored in ++ * the superblock, changing it via sysfs (currently? might change this) also ++ * updates the superblock. ++ * ++ * We store options as signed integers, where -1 means undefined. This means we ++ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only ++ * apply the options from that struct that are defined. 
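++ *
++ * For example (illustrative only), a caller can do:
++ *
++ *	struct bch_opts opts = bch2_opts_empty();
++ *
++ *	opt_set(opts, read_only, true);
++ *	bch2_opts_apply(&c->opts, opts);
++ *
++ * and only read_only is copied across, since only its _defined bit is set.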
++ */ ++ ++/* dummy option, for options that aren't stored in the superblock */ ++u64 BCH2_NO_SB_OPT(const struct bch_sb *); ++void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); ++ ++/* When can be set: */ ++enum opt_flags { ++ OPT_FS = (1 << 0), /* Filesystem option */ ++ OPT_DEVICE = (1 << 1), /* Device option */ ++ OPT_INODE = (1 << 2), /* Inode option */ ++ OPT_FORMAT = (1 << 3), /* May be specified at format time */ ++ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ ++ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ ++ OPT_HUMAN_READABLE = (1 << 6), ++ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ ++ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ ++ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ ++}; ++ ++enum opt_type { ++ BCH_OPT_BOOL, ++ BCH_OPT_UINT, ++ BCH_OPT_STR, ++ BCH_OPT_FN, ++}; ++ ++/** ++ * x(name, shortopt, type, in mem type, mode, sb_opt) ++ * ++ * @name - name of mount option, sysfs attribute, and struct bch_opts ++ * member ++ * ++ * @mode - when opt may be set ++ * ++ * @sb_option - name of corresponding superblock option ++ * ++ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR ++ */ ++ ++/* ++ * XXX: add fields for ++ * - default value ++ * - helptext ++ */ ++ ++#ifdef __KERNEL__ ++#define RATELIMIT_ERRORS_DEFAULT true ++#else ++#define RATELIMIT_ERRORS_DEFAULT false ++#endif ++ ++#define BCH_OPTS() \ ++ x(block_size, u16, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 16), \ ++ BCH_SB_BLOCK_SIZE, 8, \ ++ "size", NULL) \ ++ x(btree_node_size, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 20), \ ++ BCH_SB_BTREE_NODE_SIZE, 512, \ ++ "size", "Btree node size, default 256k") \ ++ x(errors, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_error_actions), \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ ++ NULL, "Action to take on filesystem error") \ ++ x(metadata_replicas, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_WANT, 1, \ ++ "#", "Number of metadata replicas") \ ++ x(data_replicas, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_WANT, 1, \ ++ "#", "Number of data replicas") \ ++ x(metadata_replicas_required, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(data_replicas_required, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(encoded_extent_max, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ ++ OPT_UINT(4096, 2U << 20), \ ++ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ ++ "size", "Maximum size of checksummed/compressed extents")\ ++ x(metadata_checksum, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ ++ NULL, NULL) \ ++ x(data_checksum, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ ++ NULL, NULL) \ ++ x(compression, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ 
x(background_compression, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(str_hash, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_str_hash_opts), \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ ++ NULL, "Hash function for directory entries and xattrs")\ ++ x(metadata_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_METADATA_TARGET, 0, \ ++ "(target)", "Device or label for metadata writes") \ ++ x(foreground_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_FOREGROUND_TARGET, 0, \ ++ "(target)", "Device or label for foreground writes") \ ++ x(background_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_BACKGROUND_TARGET, 0, \ ++ "(target)", "Device or label to move data to in the background")\ ++ x(promote_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_PROMOTE_TARGET, 0, \ ++ "(target)", "Device or label to promote data to on read") \ ++ x(erasure_code, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_ERASURE_CODE, false, \ ++ NULL, "Enable erasure coding (DO NOT USE YET)") \ ++ x(inodes_32bit, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODE_32BIT, true, \ ++ NULL, "Constrain inode numbers to 32 bits") \ ++ x(shard_inode_numbers, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_SHARD_INUMS, true, \ ++ NULL, "Shard new inode numbers by CPU id") \ ++ x(inodes_use_key_cache, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODES_USE_KEY_CACHE, true, \ ++ NULL, "Use the btree key cache for the inodes btree") \ ++ x(btree_node_mem_ptr_optimization, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Stash pointer to in memory btree node in btree ptr")\ ++ x(gc_reserve_percent, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(5, 21), \ ++ BCH_SB_GC_RESERVE, 8, \ ++ "%", "Percentage of disk space to reserve for copygc")\ ++ x(gc_reserve_bytes, u64, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ ++ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(0, U64_MAX), \ ++ BCH_SB_GC_RESERVE_BYTES, 0, \ ++ "%", "Amount of disk space to reserve for copygc\n" \ ++ "Takes precedence over gc_reserve_percent if set")\ ++ x(root_reserve_percent, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(0, 100), \ ++ BCH_SB_ROOT_RESERVE, 0, \ ++ "%", "Percentage of disk space to reserve for superuser")\ ++ x(wide_macs, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_128_BIT_MACS, false, \ ++ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(inline_data, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable inline data extents") \ ++ x(acl, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_POSIX_ACL, true, \ ++ NULL, "Enable POSIX acls") \ ++ x(usrquota, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_USRQUOTA, false, \ ++ NULL, "Enable user quotas") \ ++ x(grpquota, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_GRPQUOTA, false, \ ++ NULL, "Enable group quotas") \ ++ 
x(prjquota, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_PRJQUOTA, false, \ ++ NULL, "Enable project quotas") \ ++ x(degraded, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allow mounting in degraded mode") \ ++ x(very_degraded, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allow mounting in when data will be missing") \ ++ x(discard, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable discard/TRIM support") \ ++ x(verbose, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, U32_MAX), \ ++ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ ++ NULL, "Delay in milliseconds before automatic journal commits")\ ++ x(journal_flush_disabled, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ ++ NULL, "Disable journal flush on sync/fsync\n" \ ++ "If enabled, writes can be lost, but only since the\n"\ ++ "last journal write (default 1 second)") \ ++ x(journal_reclaim_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(0, U32_MAX), \ ++ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ ++ NULL, "Delay in milliseconds before automatic journal reclaim")\ ++ x(move_bytes_in_flight, u32, \ ++ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1024, U32_MAX), \ ++ BCH2_NO_SB_OPT, 1U << 20, \ ++ NULL, "Amount of IO in flight to keep in flight by the move path")\ ++ x(fsck, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Run fsck on mount") \ ++ x(fix_errors, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Fix errors during fsck without asking") \ ++ x(ratelimit_errors, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ ++ NULL, "Ratelimit error messages during fsck") \ ++ x(nochanges, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Super read only mode - no writes at all will be issued,\n"\ ++ "even if we have to replay the journal") \ ++ x(norecovery, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don't replay the journal") \ ++ x(keep_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don't free journal entries/keys after startup")\ ++ x(read_entire_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Read all journal entries, not just dirty ones")\ ++ x(read_journal_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Only read the journal, skip the rest of recovery")\ ++ x(noexcl, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don't open device in exclusive mode") \ ++ x(direct_io, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Use O_DIRECT (userspace only)") \ ++ x(sb, u64, \ ++ OPT_MOUNT, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ ++ "offset", "Sector offset of superblock") \ ++ x(read_only, u8, \ ++ OPT_FS, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(nostart, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don\'t start filesystem, only open devices") \ ++ x(reconstruct_alloc, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, 
false, \ ++ NULL, "Reconstruct alloc btree") \ ++ x(version_upgrade, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Set superblock to latest version,\n" \ ++ "allowing any new features to be used") \ ++ x(buckets_nouse, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allocate the buckets_nouse bitmap") \ ++ x(project, u8, \ ++ OPT_INODE, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(fs_size, u64, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(bucket, u32, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(durability, u8, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ BCH2_NO_SB_OPT, 1, \ ++ "n", "Data written to this device will be considered\n"\ ++ "to have already been replicated n times") ++ ++struct bch_opts { ++#define x(_name, _bits, ...) unsigned _name##_defined:1; ++ BCH_OPTS() ++#undef x ++ ++#define x(_name, _bits, ...) _bits _name; ++ BCH_OPTS() ++#undef x ++}; ++ ++static const struct bch_opts bch2_opts_default = { ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ++ ._name##_defined = true, \ ++ ._name = _default, \ ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++#define opt_defined(_opts, _name) ((_opts)._name##_defined) ++ ++#define opt_get(_opts, _name) \ ++ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) ++ ++#define opt_set(_opts, _name, _v) \ ++do { \ ++ (_opts)._name##_defined = true; \ ++ (_opts)._name = _v; \ ++} while (0) ++ ++static inline struct bch_opts bch2_opts_empty(void) ++{ ++ return (struct bch_opts) { 0 }; ++} ++ ++void bch2_opts_apply(struct bch_opts *, struct bch_opts); ++ ++enum bch_opt_id { ++#define x(_name, ...) 
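The struct bch_opts, the bch2_opts_default table and the opt_defined()/opt_get()/opt_set() accessors just above, as well as the enum of option ids that continues below, are all expanded from the single BCH_OPTS() x-macro table: each consumer re-defines x() and re-expands the same list. A minimal standalone sketch of that pattern, with invented names (MY_OPTS, my_opts, verbose, flush_delay) rather than the options in the patch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One row per option: name, C type, default value, help text. */
    #define MY_OPTS()                                                   \
        x(verbose,     uint8_t,  false, "Extra debug output")           \
        x(flush_delay, uint32_t, 1000,  "Journal flush delay (ms)")

    /* Expansion 1: a struct with a <name>_defined bit plus the value itself. */
    struct my_opts {
    #define x(_name, _type, _default, _help) unsigned _name##_defined:1;
        MY_OPTS()
    #undef x
    #define x(_name, _type, _default, _help) _type _name;
        MY_OPTS()
    #undef x
    };

    /* Expansion 2: the defaults, kept in one place. */
    static const struct my_opts my_opts_default = {
    #define x(_name, _type, _default, _help) ._name##_defined = 1, ._name = _default,
        MY_OPTS()
    #undef x
    };

    /* Expansion 3: an enum of option ids, in table order. */
    enum my_opt_id {
    #define x(_name, ...) Opt_##_name,
        MY_OPTS()
    #undef x
        my_opts_nr
    };

    #define opt_get(_opts, _name)                                       \
        ((_opts)._name##_defined ? (_opts)._name : my_opts_default._name)

    int main(void)
    {
        struct my_opts o = { 0 };       /* nothing set explicitly */

        /* Falls back to the default of 1000 from the table: */
        printf("flush_delay = %u\n", (unsigned) opt_get(o, flush_delay));
        printf("known options: %d\n", (int) my_opts_nr);
        return 0;
    }

Keeping every per-option fact (type, flags, superblock field, default, help text) in one table means a new option only has to be added in a single place.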
Opt_##_name, ++ BCH_OPTS() ++#undef x ++ bch2_opts_nr ++}; ++ ++struct bch_fs; ++struct printbuf; ++ ++struct bch_option { ++ struct attribute attr; ++ u64 (*get_sb)(const struct bch_sb *); ++ void (*set_sb)(struct bch_sb *, u64); ++ enum opt_type type; ++ enum opt_flags flags; ++ u64 min, max; ++ ++ const char * const *choices; ++ int (*parse)(struct bch_fs *, const char *, u64 *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ ++ const char *hint; ++ const char *help; ++ ++}; ++ ++extern const struct bch_option bch2_opt_table[]; ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); ++u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); ++void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); ++ ++u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); ++int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); ++void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); ++void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); ++ ++int bch2_opt_lookup(const char *); ++int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, ++ const char *, u64 *, struct printbuf *); ++ ++#define OPT_SHOW_FULL_LIST (1 << 0) ++#define OPT_SHOW_MOUNT_STYLE (1 << 1) ++ ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, ++ const struct bch_option *, u64, unsigned); ++ ++int bch2_opt_check_may_set(struct bch_fs *, int, u64); ++int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); ++ ++/* inode opts: */ ++ ++struct bch_io_opts { ++#define x(_name, _bits) unsigned _name##_defined:1; ++ BCH_INODE_OPTS() ++#undef x ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_OPTS() ++#undef x ++}; ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); ++void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); ++bool bch2_opt_is_inode_opt(enum bch_opt_id); ++ ++#endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +new file mode 100644 +index 000000000000..454c76e03be9 +--- /dev/null ++++ b/fs/bcachefs/quota.c +@@ -0,0 +1,823 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "errcode.h" ++#include "inode.h" ++#include "quota.h" ++#include "subvolume.h" ++#include "super-io.h" ++ ++static const char * const bch2_quota_types[] = { ++ "user", ++ "group", ++ "project", ++}; ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ ++ if (vstruct_bytes(&q->field) < sizeof(*q)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&q->field), sizeof(*q)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ unsigned qtyp, counter; ++ ++ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { ++ prt_printf(out, "%s: flags %llx", ++ bch2_quota_types[qtyp], ++ le64_to_cpu(q->q[qtyp].flags)); ++ ++ for (counter = 0; counter < Q_COUNTERS; counter++) ++ prt_printf(out, " %s timelimit %u warnlimit %u", ++ 
bch2_quota_counters[counter], ++ le32_to_cpu(q->q[qtyp].c[counter].timelimit), ++ le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); ++ ++ prt_newline(out); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_quota = { ++ .validate = bch2_sb_quota_validate, ++ .to_text = bch2_sb_quota_to_text, ++}; ++ ++int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->p.inode >= QTYP_NR) { ++ prt_printf(err, "invalid quota type (%llu >= %u)", ++ k.k->p.inode, QTYP_NR); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_quota)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); ++ unsigned i; ++ ++ for (i = 0; i < Q_COUNTERS; i++) ++ prt_printf(out, "%s hardlimit %llu softlimit %llu", ++ bch2_quota_counters[i], ++ le64_to_cpu(dq.v->c[i].hardlimit), ++ le64_to_cpu(dq.v->c[i].softlimit)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++#include ++#include ++#include ++ ++static inline unsigned __next_qtype(unsigned i, unsigned qtypes) ++{ ++ qtypes >>= i; ++ return qtypes ? i + __ffs(qtypes) : QTYP_NR; ++} ++ ++#define for_each_set_qtype(_c, _i, _q, _qtypes) \ ++ for (_i = 0; \ ++ (_i = __next_qtype(_i, _qtypes), \ ++ _q = &(_c)->quotas[_i], \ ++ _i < QTYP_NR); \ ++ _i++) ++ ++static bool ignore_hardlimit(struct bch_memquota_type *q) ++{ ++ if (capable(CAP_SYS_RESOURCE)) ++ return true; ++#if 0 ++ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; ++ ++ return capable(CAP_SYS_RESOURCE) && ++ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || ++ !(info->dqi_flags & DQF_ROOT_SQUASH)); ++#endif ++ return false; ++} ++ ++enum quota_msg { ++ SOFTWARN, /* Softlimit reached */ ++ SOFTLONGWARN, /* Grace time expired */ ++ HARDWARN, /* Hardlimit reached */ ++ ++ HARDBELOW, /* Usage got below inode hardlimit */ ++ SOFTBELOW, /* Usage got below inode softlimit */ ++}; ++ ++static int quota_nl[][Q_COUNTERS] = { ++ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, ++ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, ++ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, ++ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, ++ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, ++ ++ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, ++ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, ++ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, ++ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, ++ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, ++}; ++ ++struct quota_msgs { ++ u8 nr; ++ struct { ++ u8 qtype; ++ u8 msg; ++ } m[QTYP_NR * Q_COUNTERS]; ++}; ++ ++static void prepare_msg(unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); ++ ++ msgs->m[msgs->nr].qtype = qtype; ++ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; ++ msgs->nr++; ++} ++ ++static void prepare_warning(struct memquota_counter *qc, ++ unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ if (qc->warning_issued & (1 << msg_type)) ++ return; ++ ++ prepare_msg(qtype, counter, msgs, msg_type); ++} ++ ++static void flush_warnings(struct bch_qid qid, ++ struct super_block *sb, ++ struct quota_msgs *msgs) ++{ ++ unsigned i; ++ ++ for (i = 0; i < msgs->nr; i++) ++ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, 
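The __next_qtype()/for_each_set_qtype() helpers above walk only the quota types whose bit is set in a mask, using find-first-set on the remaining bits. A userspace sketch of that bit-iteration idea, using POSIX ffs() in place of the kernel's __ffs() and invented names, without the per-type mutexes:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    enum { QTYP_USR, QTYP_GRP, QTYP_PRJ, QTYP_NR };

    /* Next quota type >= i whose bit is set in mask, or QTYP_NR if none. */
    static unsigned next_set_type(unsigned i, unsigned mask)
    {
        mask >>= i;
        return mask ? i + (unsigned) (ffs((int) mask) - 1) : QTYP_NR;
    }

    int main(void)
    {
        static const char * const names[] = { "user", "group", "project" };
        unsigned mask = (1U << QTYP_USR) | (1U << QTYP_PRJ);
        unsigned i;

        for (i = 0; (i = next_set_type(i, mask)) < QTYP_NR; i++)
            printf("quota type enabled: %s\n", names[i]);

        return 0;
    }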
qid.q[i]), ++ sb->s_dev, msgs->m[i].msg); ++} ++ ++static int bch2_quota_check_limit(struct bch_fs *c, ++ unsigned qtype, ++ struct bch_memquota *mq, ++ struct quota_msgs *msgs, ++ enum quota_counters counter, ++ s64 v, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q = &c->quotas[qtype]; ++ struct memquota_counter *qc = &mq->c[counter]; ++ u64 n = qc->v + v; ++ ++ BUG_ON((s64) n < 0); ++ ++ if (mode == KEY_TYPE_QUOTA_NOCHECK) ++ return 0; ++ ++ if (v <= 0) { ++ if (n < qc->hardlimit && ++ (qc->warning_issued & (1 << HARDWARN))) { ++ qc->warning_issued &= ~(1 << HARDWARN); ++ prepare_msg(qtype, counter, msgs, HARDBELOW); ++ } ++ ++ if (n < qc->softlimit && ++ (qc->warning_issued & (1 << SOFTWARN))) { ++ qc->warning_issued &= ~(1 << SOFTWARN); ++ prepare_msg(qtype, counter, msgs, SOFTBELOW); ++ } ++ ++ qc->warning_issued = 0; ++ return 0; ++ } ++ ++ if (qc->hardlimit && ++ qc->hardlimit < n && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, HARDWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer && ++ ktime_get_real_seconds() >= qc->timer && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer == 0) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ ++ /* XXX is this the right one? */ ++ qc->timer = ktime_get_real_seconds() + ++ q->limits[counter].warnlimit; ++ } ++ ++ return 0; ++} ++ ++int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ unsigned qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq[QTYP_NR]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); ++ if (!mq[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mq[i]->c[counter].v += v; ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(qid, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static void __bch2_quota_transfer(struct bch_memquota *src_q, ++ struct bch_memquota *dst_q, ++ enum quota_counters counter, s64 v) ++{ ++ BUG_ON(v > src_q->c[counter].v); ++ BUG_ON(v + dst_q->c[counter].v < v); ++ ++ src_q->c[counter].v -= v; ++ dst_q->c[counter].v += v; ++} ++ ++int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q; ++ struct bch_memquota *src_q[3], *dst_q[3]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); ++ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); ++ ++ if (!src_q[i] || !dst_q[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = 
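bch2_quota_check_limit() above distinguishes hard limits, which are refused outright for preallocation-style callers, from soft limits, which only start failing once a grace timer has expired; freeing space clears any outstanding warnings. A much-simplified userspace sketch of that decision, with made-up names and without the warning-message plumbing:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct toy_counter {
        uint64_t v, softlimit, hardlimit;
        int64_t  grace_expires;     /* 0 = grace period not running */
    };

    static int toy_check_limit(struct toy_counter *qc, int64_t delta,
                               int enforce, unsigned grace_seconds)
    {
        uint64_t n = qc->v + delta;
        int64_t now = (int64_t) time(NULL);

        if (delta <= 0)             /* freeing space never fails */
            return 0;

        if (qc->hardlimit && n > qc->hardlimit)
            return enforce ? -EDQUOT : 0;

        if (qc->softlimit && n > qc->softlimit) {
            if (!qc->grace_expires)                 /* start grace period */
                qc->grace_expires = now + grace_seconds;
            else if (now >= qc->grace_expires)      /* grace period over */
                return enforce ? -EDQUOT : 0;
        }

        return 0;
    }

    int main(void)
    {
        struct toy_counter qc = { .v = 90, .softlimit = 100, .hardlimit = 120 };

        printf("+5:  %d\n", toy_check_limit(&qc, 5, 1, 7 * 24 * 3600));  /* 0 */
        printf("+40: %d\n", toy_check_limit(&qc, 40, 1, 7 * 24 * 3600)); /* -EDQUOT */
        return 0;
    }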
bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, ++ dst_q[i]->c[Q_SPC].v + space, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, ++ dst_q[i]->c[Q_INO].v + 1, ++ mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); ++ } ++ ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(dst, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq; ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq; ++ unsigned i; ++ ++ BUG_ON(k.k->p.inode >= QTYP_NR); ++ ++ if (!((1U << k.k->p.inode) & enabled_qtypes(c))) ++ return 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ dq = bkey_s_c_to_quota(k); ++ q = &c->quotas[k.k->p.inode]; ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); ++ if (!mq) { ++ mutex_unlock(&q->lock); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < Q_COUNTERS; i++) { ++ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); ++ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); ++ } ++ ++ mutex_unlock(&q->lock); ++ } ++ ++ return 0; ++} ++ ++void bch2_fs_quota_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ genradix_free(&c->quotas[i].table); ++} ++ ++void bch2_fs_quota_init(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ mutex_init(&c->quotas[i].lock); ++} ++ ++static void bch2_sb_quota_read(struct bch_fs *c) ++{ ++ struct bch_sb_field_quota *sb_quota; ++ unsigned i, j; ++ ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) ++ return; ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ struct bch_memquota_type *q = &c->quotas[i]; ++ ++ for (j = 0; j < Q_COUNTERS; j++) { ++ q->limits[j].timelimit = ++ le32_to_cpu(sb_quota->q[i].c[j].timelimit); ++ q->limits[j].warnlimit = ++ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); ++ } ++ } ++} ++ ++static int bch2_fs_quota_read_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct bch_subvolume subvolume; ++ int ret; ++ ++ ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't do quota accounting in snapshots: ++ */ ++ if (BCH_SUBVOLUME_SNAP(&subvolume)) ++ goto advance; ++ ++ if (!bkey_is_inode(k.k)) ++ goto advance; ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++advance: ++ bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); ++ return 0; ++} ++ ++int bch2_fs_quota_read(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_quota_read(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ __bch2_quota_set(c, k)) ?: ++ for_each_btree_key2(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ bch2_fs_quota_read_inode(&trans, &iter, k)); ++ if (ret) ++ 
bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* Enable/disable/delete quotas for an entire filesystem: */ ++ ++static int bch2_quota_enable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ /* Accounting must be enabled at mount time: */ ++ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) ++ return -EINVAL; ++ ++ /* Can't enable enforcement without accounting: */ ++ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ++ return -EINVAL; ++ ++ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ++ return -EINVAL; ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_disable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (uflags & FS_USER_QUOTA) { ++ if (c->opts.usrquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, ++ POS(QTYP_USR, 0), ++ POS(QTYP_USR + 1, 0), ++ 0, NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_GROUP_QUOTA) { ++ if (c->opts.grpquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, ++ POS(QTYP_GRP, 0), ++ POS(QTYP_GRP + 1, 0), ++ 0, NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_PROJ_QUOTA) { ++ if (c->opts.prjquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, ++ POS(QTYP_PRJ, 0), ++ POS(QTYP_PRJ + 1, 0), ++ 0, NULL); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Return quota status information, such as enforcements, quota file inode ++ * numbers etc. 
++ */ ++static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ unsigned qtypes = enabled_qtypes(c); ++ unsigned i; ++ ++ memset(state, 0, sizeof(*state)); ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ state->s_state[i].flags |= QCI_SYSFILE; ++ ++ if (!(qtypes & (1 << i))) ++ continue; ++ ++ state->s_state[i].flags |= QCI_ACCT_ENABLED; ++ ++ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; ++ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; ++ ++ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; ++ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Adjust quota timers & warnings ++ */ ++static int bch2_quota_set_info(struct super_block *sb, int type, ++ struct qc_info *info) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ struct bch_memquota_type *q; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (type >= QTYP_NR) ++ return -EINVAL; ++ ++ if (!((1 << type) & enabled_qtypes(c))) ++ return -ESRCH; ++ ++ if (info->i_fieldmask & ++ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) ++ return -EINVAL; ++ ++ q = &c->quotas[type]; ++ ++ mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) { ++ sb_quota = bch2_sb_resize_quota(&c->disk_sb, ++ sizeof(*sb_quota) / sizeof(u64)); ++ if (!sb_quota) ++ return -ENOSPC; ++ } ++ ++ if (info->i_fieldmask & QC_SPC_TIMER) ++ sb_quota->q[type].c[Q_SPC].timelimit = ++ cpu_to_le32(info->i_spc_timelimit); ++ ++ if (info->i_fieldmask & QC_SPC_WARNS) ++ sb_quota->q[type].c[Q_SPC].warnlimit = ++ cpu_to_le32(info->i_spc_warnlimit); ++ ++ if (info->i_fieldmask & QC_INO_TIMER) ++ sb_quota->q[type].c[Q_INO].timelimit = ++ cpu_to_le32(info->i_ino_timelimit); ++ ++ if (info->i_fieldmask & QC_INO_WARNS) ++ sb_quota->q[type].c[Q_INO].warnlimit = ++ cpu_to_le32(info->i_ino_warnlimit); ++ ++ bch2_sb_quota_read(c); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* Get/set individual quotas: */ ++ ++static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) ++{ ++ dst->d_space = src->c[Q_SPC].v << 9; ++ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; ++ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; ++ dst->d_spc_timer = src->c[Q_SPC].timer; ++ dst->d_spc_warns = src->c[Q_SPC].warns; ++ ++ dst->d_ino_count = src->c[Q_INO].v; ++ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; ++ dst->d_ino_softlimit = src->c[Q_INO].softlimit; ++ dst->d_ino_timer = src->c[Q_INO].timer; ++ dst->d_ino_warns = src->c[Q_INO].warns; ++} ++ ++static int bch2_get_quota(struct super_block *sb, struct kqid kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid.type]; ++ qid_t qid = from_kqid(&init_user_ns, kqid); ++ struct bch_memquota *mq; ++ ++ memset(qdq, 0, sizeof(*qdq)); ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr(&q->table, qid); ++ if (mq) ++ __bch2_quota_get(qdq, mq); ++ mutex_unlock(&q->lock); ++ ++ return 0; ++} ++ ++static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid->type]; ++ qid_t qid = from_kqid(&init_user_ns, *kqid); ++ struct genradix_iter iter; ++ struct bch_memquota *mq; ++ int ret = 0; ++ ++ mutex_lock(&q->lock); ++ ++ 
genradix_for_each_from(&q->table, iter, mq, qid) ++ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { ++ __bch2_quota_get(qdq, mq); ++ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); ++ goto found; ++ } ++ ++ ret = -ENOENT; ++found: ++ mutex_unlock(&q->lock); ++ return ret; ++} ++ ++static int bch2_set_quota_trans(struct btree_trans *trans, ++ struct bkey_i_quota *new_quota, ++ struct qc_dqblk *qdq) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_quota) ++ new_quota->v = *bkey_s_c_to_quota(k).v; ++ ++ if (qdq->d_fieldmask & QC_SPC_SOFT) ++ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ if (qdq->d_fieldmask & QC_SPC_HARD) ++ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ ++ if (qdq->d_fieldmask & QC_INO_SOFT) ++ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ if (qdq->d_fieldmask & QC_INO_HARD) ++ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bkey_i_quota new_quota; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ return ret; ++} ++ ++const struct quotactl_ops bch2_quotactl_operations = { ++ .quota_enable = bch2_quota_enable, ++ .quota_disable = bch2_quota_disable, ++ .rm_xquota = bch2_quota_remove, ++ ++ .get_state = bch2_quota_get_state, ++ .set_info = bch2_quota_set_info, ++ ++ .get_dqblk = bch2_get_quota, ++ .get_nextdqblk = bch2_get_next_quota, ++ .set_dqblk = bch2_set_quota, ++}; ++ ++#endif /* CONFIG_BCACHEFS_QUOTA */ +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +new file mode 100644 +index 000000000000..8c67ae1da7c7 +--- /dev/null ++++ b/fs/bcachefs/quota.h +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_H ++#define _BCACHEFS_QUOTA_H ++ ++#include "inode.h" ++#include "quota_types.h" ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_quota; ++ ++int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++ .key_invalid = bch2_quota_invalid, \ ++ .val_to_text = bch2_quota_to_text, \ ++} ++ ++static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) ++{ ++ return (struct bch_qid) { ++ .q[QTYP_USR] = u->bi_uid, ++ .q[QTYP_GRP] = u->bi_gid, ++ .q[QTYP_PRJ] = u->bi_project ? 
u->bi_project - 1 : 0, ++ }; ++} ++ ++static inline unsigned enabled_qtypes(struct bch_fs *c) ++{ ++ return ((c->opts.usrquota << QTYP_USR)| ++ (c->opts.grpquota << QTYP_GRP)| ++ (c->opts.prjquota << QTYP_PRJ)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, ++ s64, enum quota_acct_mode); ++ ++int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, ++ struct bch_qid, u64, enum quota_acct_mode); ++ ++void bch2_fs_quota_exit(struct bch_fs *); ++void bch2_fs_quota_init(struct bch_fs *); ++int bch2_fs_quota_read(struct bch_fs *); ++ ++extern const struct quotactl_ops bch2_quotactl_operations; ++ ++#else ++ ++static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline void bch2_fs_quota_exit(struct bch_fs *c) {} ++static inline void bch2_fs_quota_init(struct bch_fs *c) {} ++static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } ++ ++#endif ++ ++#endif /* _BCACHEFS_QUOTA_H */ +diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h +new file mode 100644 +index 000000000000..6a136083d389 +--- /dev/null ++++ b/fs/bcachefs/quota_types.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_TYPES_H ++#define _BCACHEFS_QUOTA_TYPES_H ++ ++#include ++ ++struct bch_qid { ++ u32 q[QTYP_NR]; ++}; ++ ++enum quota_acct_mode { ++ KEY_TYPE_QUOTA_PREALLOC, ++ KEY_TYPE_QUOTA_WARN, ++ KEY_TYPE_QUOTA_NOCHECK, ++}; ++ ++struct memquota_counter { ++ u64 v; ++ u64 hardlimit; ++ u64 softlimit; ++ s64 timer; ++ int warns; ++ int warning_issued; ++}; ++ ++struct bch_memquota { ++ struct memquota_counter c[Q_COUNTERS]; ++}; ++ ++typedef GENRADIX(struct bch_memquota) bch_memquota_table; ++ ++struct quota_limit { ++ u32 timelimit; ++ u32 warnlimit; ++}; ++ ++struct bch_memquota_type { ++ struct quota_limit limits[Q_COUNTERS]; ++ bch_memquota_table table; ++ struct mutex lock; ++}; ++ ++#endif /* _BCACHEFS_QUOTA_TYPES_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +new file mode 100644 +index 000000000000..ecc64dd92b05 +--- /dev/null ++++ b/fs/bcachefs/rebalance.c +@@ -0,0 +1,361 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "errcode.h" ++#include "extents.h" ++#include "io.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Check if an extent should be moved: ++ * returns -1 if it should not be moved, or ++ * device of pointer that should be moved, if known, or INT_MAX if unknown ++ */ ++static bool rebalance_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ unsigned i; ++ ++ data_opts->rewrite_ptrs = 0; ++ data_opts->target = io_opts->background_target; ++ data_opts->extra_replicas = 0; ++ data_opts->btree_insert_flags = 0; ++ ++ if (io_opts->background_compression && ++ !bch2_bkey_is_incompressible(k)) { ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ i = 0; ++ 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (!p.ptr.cached && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ data_opts->rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ } ++ ++ if (io_opts->background_target) { ++ const struct bch_extent_ptr *ptr; ++ ++ i = 0; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (!ptr->cached && ++ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target)) ++ data_opts->rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ } ++ ++ return data_opts->rewrite_ptrs != 0; ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ struct data_update_opts update_opts = { 0 }; ++ struct bkey_ptrs_c ptrs; ++ const struct bch_extent_ptr *ptr; ++ unsigned i; ++ ++ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) ++ return; ++ ++ i = 0; ++ ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr(ptrs, ptr) { ++ if ((1U << i) && update_opts.rewrite_ptrs) ++ if (atomic64_add_return(k.k->size, ++ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == ++ k.k->size) ++ rebalance_wakeup(c); ++ i++; ++ } ++} ++ ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); ++} ++ ++struct rebalance_work { ++ int dev_most_full_idx; ++ unsigned dev_most_full_percent; ++ u64 dev_most_full_work; ++ u64 dev_most_full_capacity; ++ u64 total_work; ++}; ++ ++static void rebalance_work_accumulate(struct rebalance_work *w, ++ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) ++{ ++ unsigned percent_full; ++ u64 work = dev_work + unknown_dev; ++ ++ if (work < dev_work || work < unknown_dev) ++ work = U64_MAX; ++ work = min(work, capacity); ++ ++ percent_full = div64_u64(work * 100, capacity); ++ ++ if (percent_full >= w->dev_most_full_percent) { ++ w->dev_most_full_idx = idx; ++ w->dev_most_full_percent = percent_full; ++ w->dev_most_full_work = work; ++ w->dev_most_full_capacity = capacity; ++ } ++ ++ if (w->total_work + dev_work >= w->total_work && ++ w->total_work + dev_work >= dev_work) ++ w->total_work += dev_work; ++} ++ ++static struct rebalance_work rebalance_work(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct rebalance_work ret = { .dev_most_full_idx = -1 }; ++ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ rebalance_work_accumulate(&ret, ++ atomic64_read(&ca->rebalance_work), ++ unknown_dev, ++ bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket), ++ i); ++ ++ rebalance_work_accumulate(&ret, ++ unknown_dev, 0, c->capacity, -1); ++ ++ return ret; ++} ++ ++static void rebalance_work_reset(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ atomic64_set(&ca->rebalance_work, 0); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, 0); ++} ++ ++static unsigned long curr_cputime(void) ++{ ++ u64 utime, stime; ++ ++ task_cputime_adjusted(current, &utime, &stime); ++ return nsecs_to_jiffies(utime + stime); ++} ++ ++static int bch2_rebalance_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct rebalance_work w, p; ++ struct bch_move_stats move_stats; ++ unsigned long start, prev_start; ++ unsigned long prev_run_time, prev_run_cputime; ++ unsigned long cputime, prev_cputime; ++ u64 io_start; ++ long throttle; ++ ++ set_freezable(); ++ ++ io_start = 
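rebalance_pred() above builds a per-replica bitmask: bit i is set when pointer i of the extent is stored with the wrong compression type or on a device outside the background target, and the extent is only rewritten if the mask ends up non-zero. A simplified userspace sketch of that decision, with invented structures (toy_ptr, rewrite_mask) and a plain device bitmask standing in for bch2_dev_in_target():

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_ptr { unsigned dev; unsigned compression; bool cached; };

    static unsigned rewrite_mask(const struct toy_ptr *ptrs, unsigned nr,
                                 unsigned want_compression,
                                 unsigned target_devs /* bitmask of devices */)
    {
        unsigned i, mask = 0;

        for (i = 0; i < nr; i++) {
            if (ptrs[i].cached)
                continue;       /* cached copies are never rewritten */

            if (want_compression &&
                ptrs[i].compression != want_compression)
                mask |= 1U << i;

            if (target_devs &&
                !(target_devs & (1U << ptrs[i].dev)))
                mask |= 1U << i;
        }

        return mask;
    }

    int main(void)
    {
        struct toy_ptr ptrs[] = {
            { .dev = 0, .compression = 0 },     /* wrong compression */
            { .dev = 3, .compression = 2 },     /* not on a target device */
            { .dev = 1, .compression = 2 },     /* fine as-is */
        };

        unsigned mask = rewrite_mask(ptrs, 3, /* want */ 2,
                                     (1U << 0) | (1U << 1));
        printf("rewrite_ptrs = 0x%x\n", mask);  /* prints 0x3 */
        return 0;
    }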
atomic64_read(&clock->now); ++ p = rebalance_work(c); ++ prev_start = jiffies; ++ prev_cputime = curr_cputime(); ++ ++ bch_move_stats_init(&move_stats, "rebalance"); ++ while (!kthread_wait_freezable(r->enabled)) { ++ cond_resched(); ++ ++ start = jiffies; ++ cputime = curr_cputime(); ++ ++ prev_run_time = start - prev_start; ++ prev_run_cputime = cputime - prev_cputime; ++ ++ w = rebalance_work(c); ++ BUG_ON(!w.dev_most_full_capacity); ++ ++ if (!w.total_work) { ++ r->state = REBALANCE_WAITING; ++ kthread_wait_freezable(rebalance_work(c).total_work); ++ continue; ++ } ++ ++ /* ++ * If there isn't much work to do, throttle cpu usage: ++ */ ++ throttle = prev_run_cputime * 100 / ++ max(1U, w.dev_most_full_percent) - ++ prev_run_time; ++ ++ if (w.dev_most_full_percent < 20 && throttle > 0) { ++ r->throttled_until_iotime = io_start + ++ div_u64(w.dev_most_full_capacity * ++ (20 - w.dev_most_full_percent), ++ 50); ++ ++ if (atomic64_read(&clock->now) + clock->max_slop < ++ r->throttled_until_iotime) { ++ r->throttled_until_cputime = start + throttle; ++ r->state = REBALANCE_THROTTLED; ++ ++ bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } ++ } ++ ++ /* minimum 1 mb/sec: */ ++ r->pd.rate.rate = ++ max_t(u64, 1 << 11, ++ r->pd.rate.rate * ++ max(p.dev_most_full_percent, 1U) / ++ max(w.dev_most_full_percent, 1U)); ++ ++ io_start = atomic64_read(&clock->now); ++ p = w; ++ prev_start = start; ++ prev_cputime = cputime; ++ ++ r->state = REBALANCE_RUNNING; ++ memset(&move_stats, 0, sizeof(move_stats)); ++ rebalance_work_reset(c); ++ ++ bch2_move_data(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, POS_MAX, ++ /* ratelimiting disabled for now */ ++ NULL, /* &r->pd.rate, */ ++ &move_stats, ++ writepoint_ptr(&c->rebalance_write_point), ++ true, ++ rebalance_pred, NULL); ++ } ++ ++ return 0; ++} ++ ++void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct rebalance_work w = rebalance_work(c); ++ ++ out->tabstops[0] = 20; ++ ++ prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); ++ prt_tab(out); ++ ++ prt_human_readable_u64(out, w.dev_most_full_work << 9); ++ prt_printf(out, "/"); ++ prt_human_readable_u64(out, w.dev_most_full_capacity << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "total work:"); ++ prt_tab(out); ++ ++ prt_human_readable_u64(out, w.total_work << 9); ++ prt_printf(out, "/"); ++ prt_human_readable_u64(out, c->capacity << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "rate:"); ++ prt_tab(out); ++ prt_printf(out, "%u", r->pd.rate.rate); ++ prt_newline(out); ++ ++ switch (r->state) { ++ case REBALANCE_WAITING: ++ prt_printf(out, "waiting"); ++ break; ++ case REBALANCE_THROTTLED: ++ prt_printf(out, "throttled for %lu sec or ", ++ (r->throttled_until_cputime - jiffies) / HZ); ++ prt_human_readable_u64(out, ++ (r->throttled_until_iotime - ++ atomic64_read(&c->io_clock[WRITE].now)) << 9); ++ prt_printf(out, " io"); ++ break; ++ case REBALANCE_RUNNING: ++ prt_printf(out, "running"); ++ break; ++ } ++ prt_newline(out); ++} ++ ++void bch2_rebalance_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ c->rebalance.pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->rebalance.pd.rate); ++ ++ p = rcu_dereference_protected(c->rebalance.thread, 1); ++ c->rebalance.thread = NULL; ++ ++ if (p) { ++ /* for sychronizing with rebalance_wakeup() */ ++ synchronize_rcu(); ++ ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_rebalance_start(struct bch_fs *c) ++{ ++ 
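The throttling logic in bch2_rebalance_thread() above ties CPU usage to how full the fullest device is: if that device is only p% full, the thread aims to spend roughly p% of its wall-clock time doing work and sleeps off the excess. A small sketch of just that arithmetic (throttle_for() and the tick units are illustrative; the real code works in jiffies and io-clock sectors):

    #include <stdio.h>

    static long throttle_for(unsigned long run_cputime,
                             unsigned long run_time,
                             unsigned percent_full)
    {
        if (percent_full < 1)
            percent_full = 1;

        return (long) (run_cputime * 100 / percent_full) - (long) run_time;
    }

    int main(void)
    {
        /* 10 ticks of CPU over 100 ticks while the disk is 10% full:
         * 10 * 100 / 10 - 100 = 0, already at the target, no sleep owed. */
        printf("%ld\n", throttle_for(10, 100, 10));

        /* 50 ticks of CPU over 100 ticks at 10% full:
         * 50 * 100 / 10 - 100 = 400 ticks of sleep owed. */
        printf("%ld\n", throttle_for(50, 100, 10));
        return 0;
    }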
struct task_struct *p; ++ int ret; ++ ++ if (c->rebalance.thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ get_task_struct(p); ++ rcu_assign_pointer(c->rebalance.thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->rebalance.pd); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); ++} +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +new file mode 100644 +index 000000000000..7ade0bb81cce +--- /dev/null ++++ b/fs/bcachefs/rebalance.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_H ++#define _BCACHEFS_REBALANCE_H ++ ++#include "rebalance_types.h" ++ ++static inline void rebalance_wakeup(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(c->rebalance.thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_opts *); ++void bch2_rebalance_add_work(struct bch_fs *, u64); ++ ++void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_rebalance_stop(struct bch_fs *); ++int bch2_rebalance_start(struct bch_fs *); ++void bch2_fs_rebalance_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +new file mode 100644 +index 000000000000..7462a92e9598 +--- /dev/null ++++ b/fs/bcachefs/rebalance_types.h +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_TYPES_H ++#define _BCACHEFS_REBALANCE_TYPES_H ++ ++#include "move_types.h" ++ ++enum rebalance_state { ++ REBALANCE_WAITING, ++ REBALANCE_THROTTLED, ++ REBALANCE_RUNNING, ++}; ++ ++struct bch_fs_rebalance { ++ struct task_struct __rcu *thread; ++ struct bch_pd_controller pd; ++ ++ atomic64_t work_unknown_dev; ++ ++ enum rebalance_state state; ++ u64 throttled_until_iotime; ++ unsigned long throttled_until_cputime; ++ ++ unsigned enabled:1; ++}; ++ ++#endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +new file mode 100644 +index 000000000000..b070bdf01500 +--- /dev/null ++++ b/fs/bcachefs/recovery.c +@@ -0,0 +1,1597 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "backpointers.h" ++#include "bkey_buf.h" ++#include "alloc_background.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets.h" ++#include "dirent.h" ++#include "ec.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "lru.h" ++#include "move.h" ++#include "quota.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "subvolume.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++/* for -o reconstruct_alloc: */ ++static void drop_alloc_keys(struct journal_keys *keys) ++{ ++ size_t src, dst; ++ ++ for (src = 0, dst = 0; src < keys->nr; src++) ++ if (keys->d[src].btree_id != BTREE_ID_alloc) ++ keys->d[dst++] = keys->d[src]; ++ ++ keys->nr = dst; ++} ++ ++/* ++ * 
Btree node pointers have a field to stack a pointer to the in memory btree ++ * node; we need to zero out this field when reading in btree nodes, or when ++ * reading in keys from the journal: ++ */ ++static void zero_out_btree_mem_ptr(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->k->k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; ++} ++ ++/* iterate over keys read from the journal: */ ++ ++static int __journal_key_cmp(enum btree_id l_btree_id, ++ unsigned l_level, ++ struct bpos l_pos, ++ const struct journal_key *r) ++{ ++ return (cmp_int(l_btree_id, r->btree_id) ?: ++ cmp_int(l_level, r->level) ?: ++ bpos_cmp(l_pos, r->k->k.p)); ++} ++ ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) ++{ ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); ++} ++ ++static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) ++{ ++ size_t gap_size = keys->size - keys->nr; ++ ++ if (idx >= keys->gap) ++ idx += gap_size; ++ return idx; ++} ++ ++static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) ++{ ++ return keys->d + idx_to_pos(keys, idx); ++} ++ ++static size_t __bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < keys->nr && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); ++ ++ BUG_ON(l && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); ++ ++ return l; ++} ++ ++static size_t bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); ++} ++ ++struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos, ++ struct bpos end_pos, size_t *idx) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ unsigned iters = 0; ++ struct journal_key *k; ++search: ++ if (!*idx) ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while (*idx < keys->nr && ++ (k = idx_to_key(keys, *idx), ++ k->btree_id == btree_id && ++ k->level == level && ++ bpos_cmp(k->k->k.p, end_pos) <= 0)) { ++ if (bpos_cmp(k->k->k.p, pos) >= 0 && ++ !k->overwritten) ++ return k->k; ++ ++ (*idx)++; ++ iters++; ++ if (iters == 10) { ++ *idx = 0; ++ goto search; ++ } ++ } ++ ++ return NULL; ++} ++ ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ size_t idx = 0; ++ ++ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); ++} ++ ++static void journal_iters_fix(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ /* The key we just inserted is immediately before the gap: */ ++ size_t gap_end = keys->gap + (keys->size - keys->nr); ++ struct btree_and_journal_iter *iter; ++ ++ /* ++ * If an iterator points one after the key we just inserted, decrement ++ * the iterator so it points at the key we just inserted - if the ++ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will ++ * handle that: ++ */ ++ list_for_each_entry(iter, &c->journal_iters, journal.list) ++ if (iter->journal.idx == gap_end) ++ iter->journal.idx = 
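journal_keys is kept as a gap buffer: a sorted array with one free region (the gap) somewhere in the middle so that repeated insertions near the same position stay cheap. idx_to_pos() translates logical indexes to physical array positions by skipping the gap, and __bch2_journal_key_search() binary-searches over the logical indexes. A toy version of the same layout and search, using ints in place of journal keys:

    #include <stddef.h>
    #include <stdio.h>

    struct gapbuf {
        int    *d;
        size_t  nr, size, gap;  /* gap is a logical index; gap size = size - nr */
    };

    static size_t idx_to_pos(const struct gapbuf *b, size_t idx)
    {
        return idx >= b->gap ? idx + (b->size - b->nr) : idx;
    }

    /* First logical index whose element is >= v (lower bound). */
    static size_t gapbuf_search(const struct gapbuf *b, int v)
    {
        size_t l = 0, r = b->nr, m;

        while (l < r) {
            m = l + ((r - l) >> 1);
            if (b->d[idx_to_pos(b, m)] < v)
                l = m + 1;
            else
                r = m;
        }
        return l;
    }

    int main(void)
    {
        /* 5 elements, capacity 8, gap starting at logical index 3: */
        int d[8] = { 10, 20, 30, 0, 0, 0, 40, 50 };
        struct gapbuf b = { .d = d, .nr = 5, .size = 8, .gap = 3 };

        printf("%zu\n", gapbuf_search(&b, 40)); /* logical index 3 */
        printf("%zu\n", gapbuf_search(&b, 15)); /* logical index 1 */
        return 0;
    }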
keys->gap - 1; ++} ++ ++static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_iter *iter; ++ size_t gap_size = keys->size - keys->nr; ++ ++ list_for_each_entry(iter, &c->journal_iters, list) { ++ if (iter->idx > old_gap) ++ iter->idx -= gap_size; ++ if (iter->idx >= new_gap) ++ iter->idx += gap_size; ++ } ++} ++ ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct journal_key n = { ++ .btree_id = id, ++ .level = level, ++ .k = k, ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, ++ }; ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); ++ ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); ++ ++ if (idx < keys->size && ++ journal_key_cmp(&n, &keys->d[idx]) == 0) { ++ if (keys->d[idx].allocated) ++ kfree(keys->d[idx].k); ++ keys->d[idx] = n; ++ return 0; ++ } ++ ++ if (idx > keys->gap) ++ idx -= keys->size - keys->nr; ++ ++ if (keys->nr == keys->size) { ++ struct journal_keys new_keys = { ++ .nr = keys->nr, ++ .size = max_t(size_t, keys->size, 8) * 2, ++ .journal_seq_base = keys->journal_seq_base, ++ }; ++ ++ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); ++ if (!new_keys.d) { ++ bch_err(c, "%s: error allocating new key array (size %zu)", ++ __func__, new_keys.size); ++ return -ENOMEM; ++ } ++ ++ /* Since @keys was full, there was no gap: */ ++ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); ++ kvfree(keys->d); ++ *keys = new_keys; ++ ++ /* And now the gap is at the end: */ ++ keys->gap = keys->nr; ++ } ++ ++ journal_iters_move_gap(c, keys->gap, idx); ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); ++ keys->gap = idx; ++ ++ keys->nr++; ++ keys->d[keys->gap++] = n; ++ ++ journal_iters_fix(c); ++ ++ return 0; ++} ++ ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct bkey_i *n; ++ int ret; ++ ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); ++ if (ret) ++ kfree(n); ++ return ret; ++} ++ ++int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bpos pos) ++{ ++ struct bkey_i whiteout; ++ ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; ++ ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} ++ ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->size && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ !bpos_cmp(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->idx < iter->keys->size) { ++ iter->idx++; ++ if (iter->idx == iter->keys->gap) ++ iter->idx += iter->keys->size - iter->keys->nr; ++ } ++} ++ ++struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ struct journal_key *k = 
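bch2_journal_key_insert_take() above first slides the gap to the insertion point (journal_iters_move_gap() plus move_gap()) and then stores the new key in the first free slot, reallocating only when the gap is exhausted. A companion sketch to the search example, again over ints, with an invented move_gap() that only illustrates the data movement:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct gapbuf {
        int    *d;
        size_t  nr, size, gap;
    };

    static void move_gap(struct gapbuf *b, size_t new_gap)
    {
        size_t gap_size = b->size - b->nr;

        if (new_gap < b->gap)       /* shift elements right, past the gap */
            memmove(b->d + new_gap + gap_size, b->d + new_gap,
                    (b->gap - new_gap) * sizeof(b->d[0]));
        else if (new_gap > b->gap)  /* shift elements left, into the gap */
            memmove(b->d + b->gap, b->d + b->gap + gap_size,
                    (new_gap - b->gap) * sizeof(b->d[0]));
        b->gap = new_gap;
    }

    static void gapbuf_insert(struct gapbuf *b, size_t idx, int v)
    {
        move_gap(b, idx);
        b->d[b->gap++] = v;     /* first free slot is right at the gap */
        b->nr++;
    }

    int main(void)
    {
        int d[8] = { 10, 20, 30, 0, 0, 0, 40, 50 };
        struct gapbuf b = { .d = d, .nr = 5, .size = 8, .gap = 3 };
        size_t i;

        gapbuf_insert(&b, 1, 15);   /* logical order: 10 15 20 30 40 50 */

        for (i = 0; i < b.nr; i++)
            printf("%d ", b.d[i >= b.gap ? i + (b.size - b.nr) : i]);
        printf("\n");
        return 0;
    }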
iter->keys->d + iter->idx; ++ ++ while (k < iter->keys->d + iter->keys->size && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return bkey_i_to_s_c(k->k); ++ ++ bch2_journal_iter_advance(iter); ++ k = iter->keys->d + iter->idx; ++ } ++ ++ return bkey_s_c_null; ++} ++ ++static void bch2_journal_iter_exit(struct journal_iter *iter) ++{ ++ list_del(&iter->list); ++} ++ ++static void bch2_journal_iter_init(struct bch_fs *c, ++ struct journal_iter *iter, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = &c->journal_keys; ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ if (!bpos_cmp(iter->pos, SPOS_MAX)) ++ iter->at_end = true; ++ else ++ iter->pos = bpos_successor(iter->pos); ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c btree_k, journal_k, ret; ++again: ++ if (iter->at_end) ++ return bkey_s_c_null; ++ ++ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && ++ bpos_cmp(btree_k.k->p, iter->pos) < 0) ++ bch2_journal_iter_advance_btree(iter); ++ ++ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && ++ bpos_cmp(journal_k.k->p, iter->pos) < 0) ++ bch2_journal_iter_advance(&iter->journal); ++ ++ ret = journal_k.k && ++ (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0) ++ ? 
journal_k ++ : btree_k; ++ ++ if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) ++ ret = bkey_s_c_null; ++ ++ if (ret.k) { ++ iter->pos = ret.k->p; ++ if (bkey_deleted(ret.k)) { ++ bch2_btree_and_journal_iter_advance(iter); ++ goto again; ++ } ++ } else { ++ iter->pos = SPOS_MAX; ++ iter->at_end = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) ++{ ++ bch2_journal_iter_exit(&iter->journal); ++} ++ ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); ++ iter->pos = b->data->min_key; ++ iter->at_end = false; ++} ++ ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) ++{ ++ struct btree_node_iter node_iter; ++ ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct bch_fs *c) ++{ ++ struct journal_replay **i; ++ struct genradix_iter iter; ++ ++ genradix_for_each(&c->journal_entries, iter, i) ++ if (*i) ++ kvpfree(*i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&(*i)->j)); ++ genradix_free(&c->journal_entries); ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return journal_key_cmp(l, r) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->allocated) ++ kfree(i->k); ++ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = keys->gap = keys->size = 0; ++} ++ ++static int journal_keys_sort(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct journal_replay *i, **_i; ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key *src, *dst; ++ size_t nr_keys = 0; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ if (!keys->journal_seq_base) ++ keys->journal_seq_base = le64_to_cpu(i->j.seq); ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ nr_keys++; ++ } ++ ++ if (!nr_keys) ++ return 0; ++ ++ keys->size = roundup_pow_of_two(nr_keys); ++ ++ keys->d = kvmalloc(sizeof(keys->d[0]) * keys->size, GFP_KERNEL); ++ if (!keys->d) ++ return -ENOMEM; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX); ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys->d[keys->nr++] = (struct journal_key) { ++ .btree_id = 
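bch2_btree_and_journal_iter_peek() above merges two sorted streams, the keys already written to the btree node and the newer keys still sitting only in the journal, and lets the journal key win whenever both streams have an entry at the same position. A standalone sketch of that merge over plain ints (stream and merged_next are invented names):

    #include <stddef.h>
    #include <stdio.h>

    struct stream { const int *v; size_t nr, idx; };

    static const int *merged_next(struct stream *btree, struct stream *journal)
    {
        const int *b = btree->idx   < btree->nr   ? &btree->v[btree->idx]     : NULL;
        const int *j = journal->idx < journal->nr ? &journal->v[journal->idx] : NULL;

        if (!b && !j)
            return NULL;

        if (j && (!b || *j <= *b)) {
            if (b && *b == *j)
                btree->idx++;   /* journal entry overrides the btree key */
            journal->idx++;
            return j;
        }

        btree->idx++;
        return b;
    }

    int main(void)
    {
        static const int btree_keys[]   = { 1, 3, 5, 7 };
        static const int journal_keys[] = { 3, 6 };
        struct stream b = { btree_keys, 4, 0 };
        struct stream j = { journal_keys, 2, 0 };
        const int *k;

        while ((k = merged_next(&b, &j)))
            printf("%d ", *k);  /* 1 3(journal) 5 6 7 */
        printf("\n");
        return 0;
    }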
entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(i->j.seq) - ++ keys->journal_seq_base, ++ .journal_offset = k->_data - i->j._data, ++ }; ++ } ++ ++ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys->d; ++ while (src < keys->d + keys->nr) { ++ while (src + 1 < keys->d + keys->nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys->nr = dst - keys->d; ++ keys->gap = keys->nr; ++ return 0; ++} ++ ++/* journal replay: */ ++ ++static void replay_now_at(struct journal *j, u64 seq) ++{ ++ BUG_ON(seq < j->replay_journal_seq); ++ ++ seq = min(seq, j->replay_journal_seq_end); ++ ++ while (j->replay_journal_seq < seq) ++ bch2_journal_pin_put(j, j->replay_journal_seq++); ++} ++ ++static int bch2_journal_replay_key(struct btree_trans *trans, ++ struct journal_key *k) ++{ ++ struct btree_iter iter; ++ unsigned iter_flags = ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS; ++ int ret; ++ ++ if (!k->level && k->btree_id == BTREE_ID_alloc) ++ iter_flags |= BTREE_ITER_CACHED; ++ ++ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, ++ BTREE_MAX_DEPTH, k->level, ++ iter_flags); ++ ret = bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* Must be checked with btree locked: */ ++ if (k->overwritten) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = *((const struct journal_key **)_l); ++ const struct journal_key *r = *((const struct journal_key **)_r); ++ ++ return cmp_int(l->journal_seq, r->journal_seq); ++} ++ ++static int bch2_journal_replay(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key **keys_sorted, *k; ++ struct journal *j = &c->journal; ++ size_t i; ++ int ret; ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ ++ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ if (!keys_sorted) ++ return -ENOMEM; ++ ++ for (i = 0; i < keys->nr; i++) ++ keys_sorted[i] = &keys->d[i]; ++ ++ sort(keys_sorted, keys->nr, ++ sizeof(keys_sorted[0]), ++ journal_sort_seq_cmp, NULL); ++ ++ if (keys->nr) ++ replay_now_at(j, keys->journal_seq_base); ++ ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; ++ ++ cond_resched(); ++ ++ replay_now_at(j, keys->journal_seq_base + k->journal_seq); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL| ++ (!k->allocated ++ ? 
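journal_keys_sort() above sorts everything read from the journal so that entries for the same position are adjacent and ordered oldest first, then collapses each run of duplicates down to its last element, i.e. the newest version of that key. A compact userspace sketch of the same sort-and-dedup, where toy_key and its (pos, seq) pair stand in for the real journal_key:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct toy_key { int pos; int seq; };

    /* Order by position; within a position, oldest (lowest seq) first. */
    static int cmp_key(const void *_l, const void *_r)
    {
        const struct toy_key *l = _l, *r = _r;

        return l->pos != r->pos ? (l->pos < r->pos ? -1 : 1)
             : l->seq != r->seq ? (l->seq < r->seq ? -1 : 1)
             : 0;
    }

    static size_t sort_and_dedup(struct toy_key *d, size_t nr)
    {
        struct toy_key *src = d, *dst = d, *end = d + nr;

        qsort(d, nr, sizeof(d[0]), cmp_key);

        while (src < end) {
            while (src + 1 < end && src[0].pos == src[1].pos)
                src++;          /* skip older versions of this key */
            *dst++ = *src++;
        }
        return dst - d;
    }

    int main(void)
    {
        struct toy_key keys[] = {
            { .pos = 7, .seq = 1 },
            { .pos = 3, .seq = 2 },
            { .pos = 7, .seq = 5 },     /* newer update to pos 7 */
        };
        size_t i, nr = sort_and_dedup(keys, 3);

        for (i = 0; i < nr; i++)
            printf("pos %d seq %d\n", keys[i].pos, keys[i].seq);
        /* pos 3 seq 2, then pos 7 seq 5 */
        return 0;
    }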
BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved ++ : 0), ++ bch2_journal_replay_key(&trans, k)); ++ if (ret) { ++ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", ++ ret, bch2_btree_ids[k->btree_id], k->level); ++ goto err; ++ } ++ } ++ ++ replay_now_at(j, j->replay_journal_seq_end); ++ j->replay_journal_seq = 0; ++ ++ bch2_journal_set_replay_done(j); ++ bch2_journal_flush_all_pins(j); ++ ret = bch2_journal_error(j); ++ ++ if (keys->nr && !ret) ++ bch2_journal_log_msg(&c->journal, "journal replay finished"); ++err: ++ kvfree(keys_sorted); ++ return ret; ++} ++ ++/* journal replay early: */ ++ ++static int journal_replay_entry_early(struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ int ret = 0; ++ ++ switch (entry->type) { ++ case BCH_JSET_ENTRY_btree_root: { ++ struct btree_root *r; ++ ++ if (entry->btree_id >= BTREE_ID_NR) { ++ bch_err(c, "filesystem has unknown btree type %u", ++ entry->btree_id); ++ return -EINVAL; ++ } ++ ++ r = &c->btree_roots[entry->btree_id]; ++ ++ if (entry->u64s) { ++ r->level = entry->level; ++ bkey_copy(&r->key, &entry->start[0]); ++ r->error = 0; ++ } else { ++ r->error = -EIO; ++ } ++ r->alive = true; ++ break; ++ } ++ case BCH_JSET_ENTRY_usage: { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ switch (entry->btree_id) { ++ case BCH_FS_USAGE_reserved: ++ if (entry->level < BCH_REPLICAS_MAX) ++ c->usage_base->persistent_reserved[entry->level] = ++ le64_to_cpu(u->v); ++ break; ++ case BCH_FS_USAGE_inodes: ++ c->usage_base->nr_inodes = le64_to_cpu(u->v); ++ break; ++ case BCH_FS_USAGE_key_version: ++ atomic64_set(&c->key_version, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_data_usage: { ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ ret = bch2_replicas_set_usage(c, &u->r, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ case BCH_JSET_ENTRY_dev_usage: { ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ++ ++ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ++ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ++ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); ++ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist: { ++ struct jset_entry_blacklist *bl_entry = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->seq), ++ le64_to_cpu(bl_entry->seq) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist_v2: { ++ struct jset_entry_blacklist_v2 *bl_entry = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->start), ++ le64_to_cpu(bl_entry->end) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_clock: { ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); ++ } ++ } ++ ++ return ret; ++} ++ ++static int journal_replay_early(struct bch_fs *c, ++ struct bch_sb_field_clean *clean) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ if (clean) { ++ for (entry = clean->start; ++ entry != 
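journal_replay_entry_early() above is a plain dispatch on the entry type: usage counters, per-device usage, blacklist ranges and clock readings are copied into in-memory state before any btree is read. A very small sketch of that shape, with invented entry types and fields rather than the jset entry structures from the patch:

    #include <stdint.h>
    #include <stdio.h>

    enum toy_entry_type { TOY_NR_INODES, TOY_KEY_VERSION, TOY_CLOCK };

    struct toy_entry { enum toy_entry_type type; uint64_t v; };

    struct toy_state { uint64_t nr_inodes, key_version, io_clock; };

    static void replay_entry_early(struct toy_state *s, const struct toy_entry *e)
    {
        switch (e->type) {
        case TOY_NR_INODES:   s->nr_inodes   = e->v; break;
        case TOY_KEY_VERSION: s->key_version = e->v; break;
        case TOY_CLOCK:       s->io_clock    = e->v; break;
        }
    }

    int main(void)
    {
        static const struct toy_entry log[] = {
            { TOY_NR_INODES,   42 },
            { TOY_CLOCK,       123456 },
            { TOY_KEY_VERSION, 7 },
        };
        struct toy_state s = { 0 };
        size_t i;

        for (i = 0; i < sizeof(log) / sizeof(log[0]); i++)
            replay_entry_early(&s, &log[i]);

        printf("inodes %llu, key version %llu, clock %llu\n",
               (unsigned long long) s.nr_inodes,
               (unsigned long long) s.key_version,
               (unsigned long long) s.io_clock);
        return 0;
    }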
vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } else { ++ struct genradix_iter iter; ++ struct journal_replay *i, **_i; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ vstruct_for_each(&i->j, entry) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } ++ } ++ ++ bch2_fs_usage_initialize(c); ++ ++ return 0; ++} ++ ++/* sb clean section: */ ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++static int verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ prt_printf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ prt_printf(&buf2, "(none)"); ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(k1)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, buf1.buf, ++ l2, buf2.buf); ++ } ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ ret = bch2_sb_clean_validate_late(c, clean, READ); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static bool btree_id_is_alloc(enum btree_id 
id) ++{ ++ switch (id) { ++ case BTREE_ID_alloc: ++ case BTREE_ID_backpointers: ++ case BTREE_ID_need_discard: ++ case BTREE_ID_freespace: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static int read_btree_roots(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_root *r = &c->btree_roots[i]; ++ ++ if (!r->alive) ++ continue; ++ ++ if (btree_id_is_alloc(i) && ++ c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ continue; ++ } ++ ++ if (r->error) { ++ __fsck_err(c, btree_id_is_alloc(i) ++ ? FSCK_CAN_IGNORE : 0, ++ "invalid btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_alloc) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ } ++ ++ ret = bch2_btree_root_read(c, i, &r->key, r->level); ++ if (ret) { ++ __fsck_err(c, ++ btree_id_is_alloc(i) ++ ? FSCK_CAN_IGNORE : 0, ++ "error reading btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_alloc) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ } ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (!c->btree_roots[i].b) ++ bch2_btree_root_alloc(c, i); ++fsck_err: ++ return ret; ++} ++ ++static int bch2_fs_initialize_subvolumes(struct bch_fs *c) ++{ ++ struct bkey_i_snapshot root_snapshot; ++ struct bkey_i_subvolume root_volume; ++ int ret; ++ ++ bkey_snapshot_init(&root_snapshot.k_i); ++ root_snapshot.k.p.offset = U32_MAX; ++ root_snapshot.v.flags = 0; ++ root_snapshot.v.parent = 0; ++ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; ++ root_snapshot.v.pad = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_snapshots, ++ &root_snapshot.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bkey_subvolume_init(&root_volume.k_i); ++ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; ++ root_volume.v.flags = 0; ++ root_volume.v.snapshot = cpu_to_le32(U32_MAX); ++ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_subvolumes, ++ &root_volume.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch_err(trans->c, "root inode not found"); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_inode_unpack(k, &inode); ++ BUG_ON(ret); ++ ++ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; ++ ++ ret = bch2_inode_write(trans, &iter, &inode); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_fs_recovery(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_clean *clean = NULL; ++ struct jset *last_journal_entry = NULL; ++ u64 blacklist_seq, journal_seq; ++ bool write_sb = false; ++ int ret = 0; ++ ++ if (c->sb.clean) ++ clean = read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean) ++ bch_info(c, "recovering from clean shutdown, journal seq %llu", ++ le64_to_cpu(clean->journal_seq)); ++ else ++ bch_info(c, "recovering from unclean shutdown"); ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { ++ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); ++ ret = 
-EINVAL; ++ goto err; ++ } ++ ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { ++ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { ++ bch_info(c, "alloc_v2 feature bit not set, fsck required"); ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } ++ ++ if (!c->opts.nochanges) { ++ if (c->sb.version < bcachefs_metadata_version_backpointers) { ++ bch_info(c, "version prior to backpointers, upgrade and fsck required"); ++ c->opts.version_upgrade = true; ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } ++ } ++ ++ if (c->opts.fsck && c->opts.norecovery) { ++ bch_err(c, "cannot select both norecovery and fsck"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_blacklist_table_initialize(c); ++ if (ret) { ++ bch_err(c, "error initializing blacklist table"); ++ goto err; ++ } ++ ++ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { ++ struct genradix_iter iter; ++ struct journal_replay **i; ++ ++ bch_verbose(c, "starting journal read"); ++ ret = bch2_journal_read(c, &blacklist_seq, &journal_seq); ++ if (ret) ++ goto err; ++ ++ genradix_for_each_reverse(&c->journal_entries, iter, i) ++ if (*i && !(*i)->ignore) { ++ last_journal_entry = &(*i)->j; ++ break; ++ } ++ ++ if (mustfix_fsck_err_on(c->sb.clean && ++ last_journal_entry && ++ !journal_entry_empty(last_journal_entry), c, ++ "filesystem marked clean but journal not empty")) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ } ++ ++ if (!last_journal_entry) { ++ fsck_err_on(!c->sb.clean, c, "no journal entries found"); ++ goto use_clean; ++ } ++ ++ ret = journal_keys_sort(c); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean && last_journal_entry) { ++ ret = verify_superblock_clean(c, &clean, ++ last_journal_entry); ++ if (ret) ++ goto err; ++ } ++ } else { ++use_clean: ++ if (!clean) { ++ bch_err(c, "no superblock clean section found"); ++ ret = -BCH_ERR_fsck_repair_impossible; ++ goto err; ++ ++ } ++ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ ++ if (c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ drop_alloc_keys(&c->journal_keys); ++ } ++ ++ zero_out_btree_mem_ptr(&c->journal_keys); ++ ++ ret = journal_replay_early(c, clean); ++ if (ret) ++ goto err; ++ ++ /* ++ * After an unclean shutdown, skip then next few journal sequence ++ * numbers as they may have been referenced by btree writes that ++ * happened before their corresponding journal writes - those btree ++ * writes need to be ignored, by skipping and blacklisting the next few ++ * journal sequence numbers: ++ */ ++ if (!c->sb.clean) ++ journal_seq += 8; ++ ++ if (blacklist_seq != journal_seq) { ++ ret = bch2_journal_seq_blacklist_add(c, ++ blacklist_seq, journal_seq); ++ if (ret) { ++ bch_err(c, "error creating new journal seq blacklist entry"); ++ goto err; ++ } ++ } ++ ++ /* ++ * note: cmd_list_journal needs the blacklist table fully up to date so ++ * it can asterisk ignored journal entries: ++ */ ++ if (c->opts.read_journal_only) ++ goto out; ++ ++ ret = bch2_fs_journal_start(&c->journal, 
journal_seq); ++ if (ret) ++ goto err; ++ ++ /* ++ * Skip past versions that might have possibly been used (as nonces), ++ * but hadn't had their pointers written: ++ */ ++ if (c->sb.encryption_type && !c->sb.clean) ++ atomic64_add(1 << 16, &c->key_version); ++ ++ ret = read_btree_roots(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "starting alloc read"); ++ err = "error reading allocation information"; ++ ++ down_read(&c->gc_lock); ++ ret = bch2_alloc_read(c); ++ up_read(&c->gc_lock); ++ ++ if (ret) ++ goto err; ++ bch_verbose(c, "alloc read done"); ++ ++ bch_verbose(c, "starting stripes_read"); ++ err = "error reading stripes"; ++ ret = bch2_stripes_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "stripes_read done"); ++ ++ bch2_stripes_heap_start(c); ++ ++ if (c->opts.fsck) { ++ bool metadata_only = c->opts.norecovery; ++ ++ bch_info(c, "checking allocations"); ++ err = "error checking allocations"; ++ ret = bch2_gc(c, true, metadata_only); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking allocations"); ++ ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ bch_info(c, "checking need_discard and freespace btrees"); ++ err = "error checking need_discard and freespace btrees"; ++ ret = bch2_check_alloc_info(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking need_discard and freespace btrees"); ++ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ ++ bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c); ++ if (ret) ++ goto err; ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); ++ ++ bch_info(c, "checking lrus"); ++ err = "error checking lrus"; ++ ret = bch2_check_lrus(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking lrus"); ++ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); ++ ++ bch_info(c, "checking backpointers to alloc keys"); ++ err = "error checking backpointers to alloc keys"; ++ ret = bch2_check_btree_backpointers(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking backpointers to alloc keys"); ++ ++ bch_info(c, "checking backpointers to extents"); ++ err = "error checking backpointers to extents"; ++ ret = bch2_check_backpointers_to_extents(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking backpointers to extents"); ++ ++ bch_info(c, "checking extents to backpointers"); ++ err = "error checking extents to backpointers"; ++ ret = bch2_check_extents_to_backpointers(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking extents to backpointers"); ++ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); ++ ++ bch_info(c, "checking alloc to lru refs"); ++ err = "error checking alloc to lru refs"; ++ ret = bch2_check_alloc_to_lru_refs(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking alloc to lru refs"); ++ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ++ } else { ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ ++ if (c->opts.norecovery) ++ goto out; ++ ++ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c); ++ if (ret) ++ goto err; ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); ++ } ++ ++ err = "error initializing freespace"; ++ ret = 
bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ bch2_fs_lazy_rw(c); ++ ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ /* set bi_subvol on root inode */ ++ err = "error upgrade root inode for subvolumes"; ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_fs_upgrade_for_subvolumes(&trans)); ++ if (ret) ++ goto err; ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "starting fsck"); ++ err = "error in fsck"; ++ ret = bch2_fsck_full(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "fsck done"); ++ } else if (!c->sb.clean) { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ } ++ ++ if (enabled_qtypes(c)) { ++ bch_verbose(c, "reading quotas"); ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "quotas done"); ++ } ++ ++ mutex_lock(&c->sb_lock); ++ if (c->opts.version_upgrade) { ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ write_sb = true; ++ } ++ ++ if (!test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ write_sb = true; ++ } ++ ++ if (c->opts.fsck && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); ++ write_sb = true; ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || ++ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ struct bch_move_stats stats; ++ ++ bch_move_stats_init(&stats, "recovery"); ++ ++ bch_info(c, "scanning for old btree nodes"); ++ ret = bch2_fs_read_write(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_scan_old_btree_nodes(c, &stats); ++ if (ret) ++ goto err; ++ bch_info(c, "scanning for old btree nodes done"); ++ } ++ ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++ ++ ret = 0; ++out: ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ bch2_flush_fsck_errs(c); ++ ++ if (!c->opts.keep_journal) { ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(c); ++ } ++ kfree(clean); ++ ++ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { ++ bch2_fs_read_write_early(c); ++ bch2_delete_dead_snapshots_async(c); ++ } ++ ++ if (ret) ++ bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); ++ else ++ bch_verbose(c, "ret %s", bch2_err_str(ret)); ++ return ret; ++err: ++fsck_err: ++ bch2_fs_emergency_read_only(c); ++ goto out; ++} ++ ++int bch2_fs_initialize(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ struct bkey_inode_buf packed_inode; ++ struct qstr lostfound = QSTR("lost+found"); ++ const char *err = "cannot allocate 
memory"; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ bch_notice(c, "initializing new filesystem"); ++ ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); ++ ++ if (c->sb.version < bcachefs_metadata_version_backpointers) ++ c->opts.version_upgrade = true; ++ ++ if (c->opts.version_upgrade) { ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ bch2_btree_root_alloc(c, i); ++ ++ for_each_online_member(ca, c, i) ++ bch2_dev_usage_init(ca); ++ ++ err = "unable to allocate journal buckets"; ++ for_each_online_member(ca, c, i) { ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ bch2_fs_journal_start(&c->journal, 1); ++ bch2_journal_set_replay_done(&c->journal); ++ ++ err = "error going read-write"; ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * Write out the superblock and journal buckets, now that we can do ++ * btree updates ++ */ ++ bch_verbose(c, "marking superblocks"); ++ err = "error marking superblock and journal"; ++ for_each_member_device(ca, c, i) { ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ goto err; ++ } ++ ++ ca->new_fs_bucket_idx = 0; ++ } ++ ++ bch_verbose(c, "initializing freespace"); ++ err = "error initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ bch2_inode_init(c, &root_inode, 0, 0, ++ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; ++ bch2_inode_pack(c, &packed_inode, &root_inode); ++ packed_inode.inode.k.p.snapshot = U32_MAX; ++ ++ err = "error creating root directory"; ++ ret = bch2_btree_insert(c, BTREE_ID_inodes, ++ &packed_inode.inode.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_early(c, &lostfound_inode); ++ ++ err = "error creating lost+found"; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_SUBVOL_INUM, ++ &root_inode, &lostfound_inode, ++ &lostfound, ++ 0, 0, S_IFDIR|0700, 0, ++ NULL, NULL, (subvol_inum) { 0 }, 0)); ++ if (ret) { ++ bch_err(c, "error creating lost+found"); ++ goto err; ++ } ++ ++ if (enabled_qtypes(c)) { ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ } ++ ++ err = "error writing first journal entry"; ++ ret = bch2_journal_flush(&c->journal); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++err: ++ pr_err("Error initializing 
new filesystem: %s (%i)", err, ret); ++ return ret; ++} +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +new file mode 100644 +index 000000000000..8c0348e8b84c +--- /dev/null ++++ b/fs/bcachefs/recovery.h +@@ -0,0 +1,58 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_H ++#define _BCACHEFS_RECOVERY_H ++ ++struct journal_iter { ++ struct list_head list; ++ enum btree_id btree_id; ++ unsigned level; ++ size_t idx; ++ struct journal_keys *keys; ++}; ++ ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ struct bpos pos; ++ bool at_end; ++}; ++ ++struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos, struct bpos, size_t *); ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_insert(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_delete(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, ++ struct btree *); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct bch_fs *); ++ ++int bch2_fs_recovery(struct bch_fs *); ++int bch2_fs_initialize(struct bch_fs *); ++ ++#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +new file mode 100644 +index 000000000000..d5c14bb2992d +--- /dev/null ++++ b/fs/bcachefs/reflink.c +@@ -0,0 +1,422 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "reflink.h" ++#include "subvolume.h" ++ ++#include <linux/sched/signal.h> ++ ++static inline unsigned bkey_type_to_indirect(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ return KEY_TYPE_reflink_v; ++ case KEY_TYPE_inline_data: ++ return KEY_TYPE_indirect_inline_data; ++ default: ++ return 0; ++ } ++} ++ ++/* reflink pointers */ ++ ++int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(p.k), sizeof(*p.v)); ++ return -EINVAL; ++ } ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && ++ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { ++ prt_printf(err, "idx < front_pad (%llu < %u)", ++ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_reflink_p_to_text(struct printbuf *out, 
struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ prt_printf(out, "idx %llu front_pad %u back_pad %u", ++ le64_to_cpu(p.v->idx), ++ le32_to_cpu(p.v->front_pad), ++ le32_to_cpu(p.v->back_pad)); ++} ++ ++bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); ++ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); ++ ++ /* ++ * Disabled for now, the triggers code needs to be reworked for merging ++ * of reflink pointers to work: ++ */ ++ return false; ++ ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return false; ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++/* indirect extents */ ++ ++int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(r.k), sizeof(*r.v)); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); ++ ++ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); ++} ++ ++int bch2_trans_mark_reflink_v(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ return 0; ++ } ++ } ++ ++ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); ++} ++ ++/* indirect inline data */ ++ ++int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_indirect_inline_data_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ prt_printf(out, "refcount %llu datalen %u: %*phN", ++ le64_to_cpu(d.v->refcount), datalen, ++ min(datalen, 32U), d.v->data); ++} ++ ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_indirect_inline_data *r = ++ bkey_i_to_indirect_inline_data(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ } ++ } ++ ++ return 0; ++} ++ ++static int bch2_make_extent_indirect(struct btree_trans *trans, ++ struct btree_iter 
*extent_iter, ++ struct bkey_i *orig) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter reflink_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_i *r_v; ++ struct bkey_i_reflink_p *r_p; ++ __le64 *refcount; ++ int ret; ++ ++ if (orig->k.type == KEY_TYPE_inline_data) ++ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); ++ ++ for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, ++ POS(0, c->reflink_hint), ++ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { ++ if (reflink_iter.pos.inode) { ++ bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); ++ continue; ++ } ++ ++ if (bkey_deleted(k.k) && orig->k.size <= k.k->size) ++ break; ++ } ++ ++ if (ret) ++ goto err; ++ ++ /* rewind iter to start of hole, if necessary: */ ++ bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); ++ ++ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_init(&r_v->k); ++ r_v->k.type = bkey_type_to_indirect(&orig->k); ++ r_v->k.p = reflink_iter.pos; ++ bch2_key_resize(&r_v->k, orig->k.size); ++ r_v->k.version = orig->k.version; ++ ++ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); ++ ++ refcount = bkey_refcount(r_v); ++ *refcount = 0; ++ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); ++ ++ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); ++ if (ret) ++ goto err; ++ ++ /* ++ * orig is in a bkey_buf which statically allocates 5 64s for the val, ++ * so we know it will be big enough: ++ */ ++ orig->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(orig); ++ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ memset(&r_p->v, 0, sizeof(r_p->v)); ++ ++ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ ++ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++err: ++ c->reflink_hint = reflink_iter.pos.offset; ++ bch2_trans_iter_exit(trans, &reflink_iter); ++ ++ return ret; ++} ++ ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key_continue_norestart(*iter, 0, k, ret) { ++ if (bkey_cmp(iter->pos, end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) ++ return k; ++ } ++ ++ if (bkey_cmp(iter->pos, end) >= 0) ++ bch2_btree_iter_set_pos(iter, end); ++ return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; ++} ++ ++s64 bch2_remap_range(struct bch_fs *c, ++ subvol_inum dst_inum, u64 dst_offset, ++ subvol_inum src_inum, u64 src_offset, ++ u64 remap_sectors, ++ u64 new_i_size, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter dst_iter, src_iter; ++ struct bkey_s_c src_k; ++ struct bkey_buf new_dst, new_src; ++ struct bpos dst_start = POS(dst_inum.inum, dst_offset); ++ struct bpos src_start = POS(src_inum.inum, src_offset); ++ struct bpos dst_end = dst_start, src_end = src_start; ++ struct bpos src_want; ++ u64 dst_done; ++ u32 dst_snapshot, src_snapshot; ++ int ret = 0, ret2 = 0; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return -EROFS; ++ ++ bch2_check_set_feature(c, BCH_FEATURE_reflink); ++ ++ dst_end.offset += remap_sectors; ++ src_end.offset += remap_sectors; ++ ++ bch2_bkey_buf_init(&new_dst); ++ bch2_bkey_buf_init(&new_src); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ++ ++ bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, ++ BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, ++ BTREE_ITER_INTENT); ++ ++ while ((ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) && ++ bkey_cmp(dst_iter.pos, dst_end) < 0) { ++ struct disk_reservation disk_res = { 0 }; ++ ++ bch2_trans_begin(&trans); ++ ++ if (fatal_signal_pending(current)) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, ++ &src_snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, ++ &dst_snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); ++ ++ dst_done = dst_iter.pos.offset - dst_start.offset; ++ src_want = POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(&src_iter, src_want); ++ ++ src_k = get_next_src(&src_iter, src_end); ++ ret = bkey_err(src_k); ++ if (ret) ++ continue; ++ ++ if (bkey_cmp(src_want, src_iter.pos) < 0) { ++ ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, ++ min(dst_end.offset, ++ dst_iter.pos.offset + ++ src_iter.pos.offset - src_want.offset), ++ i_sectors_delta); ++ continue; ++ } ++ ++ if (src_k.k->type != KEY_TYPE_reflink_p) { ++ bch2_btree_iter_set_pos_to_extent_start(&src_iter); ++ ++ bch2_bkey_buf_reassemble(&new_src, c, src_k); ++ src_k = bkey_i_to_s_c(new_src.k); ++ ++ ret = bch2_make_extent_indirect(&trans, &src_iter, ++ new_src.k); ++ if (ret) ++ continue; ++ ++ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); ++ } ++ ++ if (src_k.k->type == KEY_TYPE_reflink_p) { ++ struct bkey_s_c_reflink_p src_p = ++ bkey_s_c_to_reflink_p(src_k); ++ struct bkey_i_reflink_p *dst_p = ++ bkey_reflink_p_init(new_dst.k); ++ ++ u64 offset = le64_to_cpu(src_p.v->idx) + ++ (src_want.offset - ++ bkey_start_offset(src_k.k)); ++ ++ dst_p->v.idx = cpu_to_le64(offset); ++ } else { ++ BUG(); ++ } ++ ++ new_dst.k->k.p = dst_iter.pos; ++ bch2_key_resize(&new_dst.k->k, ++ min(src_k.k->p.offset - src_want.offset, ++ dst_end.offset - dst_iter.pos.offset)); ++ ++ ret = bch2_extent_update(&trans, dst_inum, &dst_iter, ++ new_dst.k, &disk_res, NULL, ++ new_i_size, i_sectors_delta, ++ true); ++ bch2_disk_reservation_put(c, &disk_res); ++ } ++ bch2_trans_iter_exit(&trans, &dst_iter); ++ bch2_trans_iter_exit(&trans, &src_iter); ++ ++ BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); ++ BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); ++ ++ dst_done = dst_iter.pos.offset - 
dst_start.offset; ++ new_i_size = min(dst_iter.pos.offset << 9, new_i_size); ++ ++ do { ++ struct bch_inode_unpacked inode_u; ++ struct btree_iter inode_iter = { NULL }; ++ ++ bch2_trans_begin(&trans); ++ ++ ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, ++ dst_inum, BTREE_ITER_INTENT); ++ ++ if (!ret2 && ++ inode_u.bi_size < new_i_size) { ++ inode_u.bi_size = new_i_size; ++ ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ } ++ ++ bch2_trans_iter_exit(&trans, &inode_iter); ++ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&new_src, c); ++ bch2_bkey_buf_exit(&new_dst, c); ++ ++ percpu_ref_put(&c->writes); ++ ++ return dst_done ?: ret ?: ret2; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +new file mode 100644 +index 000000000000..f9848dc3eebb +--- /dev/null ++++ b/fs/bcachefs/reflink.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REFLINK_H ++#define _BCACHEFS_REFLINK_H ++ ++int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_p_invalid, \ ++ .val_to_text = bch2_reflink_p_to_text, \ ++ .key_merge = bch2_reflink_p_merge, \ ++ .trans_trigger = bch2_trans_mark_reflink_p, \ ++ .atomic_trigger = bch2_mark_reflink_p, \ ++} ++ ++int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, unsigned); ++ ++#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_v_invalid, \ ++ .val_to_text = bch2_reflink_v_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_reflink_v, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_indirect_inline_data_to_text(struct printbuf *, ++ struct bch_fs *, struct bkey_s_c); ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *, ++ enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, ++ unsigned); ++ ++#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ ++ .key_invalid = bch2_indirect_inline_data_invalid, \ ++ .val_to_text = bch2_indirect_inline_data_to_text, \ ++ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ ++} ++ ++static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_s_c_to_reflink_v(k).v->refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_s_c_to_indirect_inline_data(k).v->refcount; ++ default: ++ return NULL; ++ } ++} ++ ++static inline __le64 *bkey_refcount(struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_i_to_reflink_v(k)->v.refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_i_to_indirect_inline_data(k)->v.refcount; ++ default: ++ return NULL; ++ } ++} ++ ++s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, ++ subvol_inum, u64, u64, u64, s64 *); ++ ++#endif /* 
_BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +new file mode 100644 +index 000000000000..9cb47ba62bc3 +--- /dev/null ++++ b/fs/bcachefs/replicas.c +@@ -0,0 +1,1073 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, ++ struct bch_replicas_cpu *); ++ ++/* Replicas tracking - in memory: */ ++ ++static void verify_replicas_entry(struct bch_replicas_entry *e) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ BUG_ON(e->data_type >= BCH_DATA_NR); ++ BUG_ON(!e->nr_devs); ++ BUG_ON(e->nr_required > 1 && ++ e->nr_required >= e->nr_devs); ++ ++ for (i = 0; i + 1 < e->nr_devs; i++) ++ BUG_ON(e->devs[i] >= e->devs[i + 1]); ++#endif ++} ++ ++void bch2_replicas_entry_sort(struct bch_replicas_entry *e) ++{ ++ bubble_sort(e->devs, e->nr_devs, u8_cmp); ++} ++ ++static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) ++{ ++ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); ++} ++ ++void bch2_replicas_entry_v0_to_text(struct printbuf *out, ++ struct bch_replicas_entry_v0 *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ prt_printf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ prt_printf(out, "(invalid data type %u)", e->data_type); ++ ++ prt_printf(out, ": %u [", e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ prt_printf(out, i ? " %u" : "%u", e->devs[i]); ++ prt_printf(out, "]"); ++} ++ ++void bch2_replicas_entry_to_text(struct printbuf *out, ++ struct bch_replicas_entry *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ prt_printf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ prt_printf(out, "(invalid data type %u)", e->data_type); ++ ++ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); ++ prt_printf(out, "]"); ++} ++ ++void bch2_cpu_replicas_to_text(struct printbuf *out, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_cpu_replicas_entry(r, e) { ++ if (!first) ++ prt_printf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++static void extent_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ r->nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (!p.has_ec) ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ else ++ r->nr_required = 0; ++ } ++} ++ ++static void stripe_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ const struct bch_extent_ptr *ptr; ++ ++ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; ++ ++ for (ptr = s.v->ptrs; ++ ptr < s.v->ptrs + s.v->nr_blocks; ++ ptr++) ++ r->devs[r->nr_devs++] = ptr->dev; ++} ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ++ struct bkey_s_c k) ++{ ++ e->nr_devs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ e->data_type = BCH_DATA_btree; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ e->data_type = BCH_DATA_user; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_stripe: ++ e->data_type = BCH_DATA_parity; ++ stripe_to_replicas(k, e); ++ break; ++ } ++ ++ bch2_replicas_entry_sort(e); ++} ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *e, ++ enum bch_data_type data_type, ++ struct bch_devs_list devs) ++{ ++ unsigned i; ++ ++ BUG_ON(!data_type || ++ data_type == BCH_DATA_sb || ++ data_type >= BCH_DATA_NR); ++ ++ e->data_type = data_type; ++ e->nr_devs = 0; ++ e->nr_required = 1; ++ ++ for (i = 0; i < devs.nr; i++) ++ e->devs[e->nr_devs++] = devs.devs[i]; ++ ++ bch2_replicas_entry_sort(e); ++} ++ ++static struct bch_replicas_cpu ++cpu_replicas_add_entry(struct bch_replicas_cpu *old, ++ struct bch_replicas_entry *new_entry) ++{ ++ unsigned i; ++ struct bch_replicas_cpu new = { ++ .nr = old->nr + 1, ++ .entry_size = max_t(unsigned, old->entry_size, ++ replicas_entry_bytes(new_entry)), ++ }; ++ ++ BUG_ON(!new_entry->data_type); ++ verify_replicas_entry(new_entry); ++ ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) ++ return new; ++ ++ for (i = 0; i < old->nr; i++) ++ memcpy(cpu_replicas_entry(&new, i), ++ cpu_replicas_entry(old, i), ++ old->entry_size); ++ ++ memcpy(cpu_replicas_entry(&new, old->nr), ++ new_entry, ++ replicas_entry_bytes(new_entry)); ++ ++ bch2_cpu_replicas_sort(&new); ++ return new; ++} ++ ++static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ int idx, entry_size = replicas_entry_bytes(search); ++ ++ if (unlikely(entry_size > r->entry_size)) ++ return -1; ++ ++ verify_replicas_entry(search); ++ ++#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) ++ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, ++ entry_cmp, search); ++#undef entry_cmp ++ ++ return idx < r->nr ? 
idx : -1; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ bch2_replicas_entry_sort(search); ++ ++ return __replicas_entry_idx(&c->replicas, search); ++} ++ ++static bool __replicas_has_entry(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ return __replicas_entry_idx(r, search) >= 0; ++} ++ ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ bool marked; ++ ++ if (!search->nr_devs) ++ return true; ++ ++ verify_replicas_entry(search); ++ ++ percpu_down_read(&c->mark_lock); ++ marked = __replicas_has_entry(&c->replicas, search) && ++ (likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++static void __replicas_table_update(struct bch_fs_usage *dst, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage *src, ++ struct bch_replicas_cpu *src_r) ++{ ++ int src_idx, dst_idx; ++ ++ *dst = *src; ++ ++ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { ++ if (!src->replicas[src_idx]) ++ continue; ++ ++ dst_idx = __replicas_entry_idx(dst_r, ++ cpu_replicas_entry(src_r, src_idx)); ++ BUG_ON(dst_idx < 0); ++ ++ dst->replicas[dst_idx] = src->replicas[src_idx]; ++ } ++} ++ ++static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage __percpu *src_p, ++ struct bch_replicas_cpu *src_r) ++{ ++ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; ++ struct bch_fs_usage *dst, *src = (void *) ++ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ ++ preempt_disable(); ++ dst = this_cpu_ptr(dst_p); ++ preempt_enable(); ++ ++ __replicas_table_update(dst, dst_r, src, src_r); ++} ++ ++/* ++ * Resize filesystem accounting: ++ */ ++static int replicas_table_update(struct bch_fs *c, ++ struct bch_replicas_cpu *new_r) ++{ ++ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; ++ struct bch_fs_usage_online *new_scratch = NULL; ++ struct bch_fs_usage __percpu *new_gc = NULL; ++ struct bch_fs_usage *new_base = NULL; ++ unsigned i, bytes = sizeof(struct bch_fs_usage) + ++ sizeof(u64) * new_r->nr; ++ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + ++ sizeof(u64) * new_r->nr; ++ int ret = 0; ++ ++ memset(new_usage, 0, sizeof(new_usage)); ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ if (!(new_usage[i] = __alloc_percpu_gfp(bytes, ++ sizeof(u64), GFP_KERNEL))) ++ goto err; ++ ++ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || ++ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || ++ (c->usage_gc && ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) ++ goto err; ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ if (c->usage[i]) ++ __replicas_table_update_pcpu(new_usage[i], new_r, ++ c->usage[i], &c->replicas); ++ if (c->usage_base) ++ __replicas_table_update(new_base, new_r, ++ c->usage_base, &c->replicas); ++ if (c->usage_gc) ++ __replicas_table_update_pcpu(new_gc, new_r, ++ c->usage_gc, &c->replicas); ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ swap(c->usage[i], new_usage[i]); ++ swap(c->usage_base, new_base); ++ swap(c->usage_scratch, new_scratch); ++ swap(c->usage_gc, new_gc); ++ swap(c->replicas, *new_r); ++out: ++ free_percpu(new_gc); ++ kfree(new_scratch); ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ free_percpu(new_usage[i]); ++ kfree(new_base); ++ return ret; ++err: ++ bch_err(c, "error updating replicas table: memory allocation failure"); ++ ret = 
-ENOMEM; ++ goto out; ++} ++ ++static unsigned reserve_journal_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ unsigned journal_res_u64s = 0; ++ ++ /* nr_inodes: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* key_version: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* persistent_reserved: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * ++ BCH_REPLICAS_MAX; ++ ++ for_each_cpu_replicas_entry(r, e) ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + ++ e->nr_devs, sizeof(u64)); ++ return journal_res_u64s; ++} ++ ++noinline ++static int bch2_mark_replicas_slowpath(struct bch_fs *c, ++ struct bch_replicas_entry *new_entry) ++{ ++ struct bch_replicas_cpu new_r, new_gc; ++ int ret = 0; ++ ++ verify_replicas_entry(new_entry); ++ ++ memset(&new_r, 0, sizeof(new_r)); ++ memset(&new_gc, 0, sizeof(new_gc)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (c->replicas_gc.entries && ++ !__replicas_has_entry(&c->replicas_gc, new_entry)) { ++ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); ++ if (!new_gc.entries) ++ goto err; ++ } ++ ++ if (!__replicas_has_entry(&c->replicas, new_entry)) { ++ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); ++ if (!new_r.entries) ++ goto err; ++ ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); ++ if (ret) ++ goto err; ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &new_r)); ++ } ++ ++ if (!new_r.entries && ++ !new_gc.entries) ++ goto out; ++ ++ /* allocations done, now commit: */ ++ ++ if (new_r.entries) ++ bch2_write_super(c); ++ ++ /* don't update in memory replicas until changes are persistent */ ++ percpu_down_write(&c->mark_lock); ++ if (new_r.entries) ++ ret = replicas_table_update(c, &new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ percpu_up_write(&c->mark_lock); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ kfree(new_r.entries); ++ kfree(new_gc.entries); ++ ++ return ret; ++err: ++ bch_err(c, "error adding replicas entry: memory allocation failure"); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) ++{ ++ return likely(bch2_replicas_marked(c, r)) ++ ? 
0 : bch2_mark_replicas_slowpath(c, r); ++} ++ ++/* replicas delta list: */ ++ ++int bch2_replicas_delta_list_mark(struct bch_fs *c, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ int ret = 0; ++ ++ for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) ++ ret = bch2_mark_replicas(c, &d->r); ++ return ret; ++} ++ ++/* ++ * Old replicas_gc mechanism: only used for journal replicas entries now, should ++ * die at some point: ++ */ ++ ++int bch2_replicas_gc_end(struct bch_fs *c, int ret) ++{ ++ unsigned i; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * this is kind of crappy; the replicas gc mechanism needs to be ripped ++ * out ++ */ ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct bch_replicas_cpu n; ++ ++ if (!__replicas_has_entry(&c->replicas_gc, e) && ++ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { ++ n = cpu_replicas_add_entry(&c->replicas_gc, e); ++ if (!n.entries) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ swap(n, c->replicas_gc); ++ kfree(n.entries); ++ } ++ } ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &c->replicas_gc); ++err: ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i = 0; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ BUG_ON(c->replicas_gc.entries); ++ ++ c->replicas_gc.nr = 0; ++ c->replicas_gc.entry_size = 0; ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) { ++ c->replicas_gc.nr++; ++ c->replicas_gc.entry_size = ++ max_t(unsigned, c->replicas_gc.entry_size, ++ replicas_entry_bytes(e)); ++ } ++ ++ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, ++ c->replicas_gc.entry_size, ++ GFP_KERNEL); ++ if (!c->replicas_gc.entries) { ++ mutex_unlock(&c->sb_lock); ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) ++ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), ++ e, c->replicas_gc.entry_size); ++ ++ bch2_cpu_replicas_sort(&c->replicas_gc); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* New much simpler mechanism for clearing out unneeded replicas entries: */ ++ ++int bch2_replicas_gc2(struct bch_fs *c) ++{ ++ struct bch_replicas_cpu new = { 0 }; ++ unsigned i, nr; ++ int ret = 0; ++ ++ bch2_journal_meta(&c->journal); ++retry: ++ nr = READ_ONCE(c->replicas.nr); ++ new.entry_size = READ_ONCE(c->replicas.entry_size); ++ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) { ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sb_lock); ++ kfree(new.entries); ++ goto retry; ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if 
(e->data_type == BCH_DATA_journal || ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]) || ++ percpu_u64_get(&c->usage[2]->replicas[i]) || ++ percpu_u64_get(&c->usage[3]->replicas[i])) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } ++ ++ bch2_cpu_replicas_sort(&new); ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &new); ++err: ++ kfree(new.entries); ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_set_usage(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ u64 sectors) ++{ ++ int ret, idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) { ++ struct bch_replicas_cpu n; ++ ++ n = cpu_replicas_add_entry(&c->replicas, r); ++ if (!n.entries) ++ return -ENOMEM; ++ ++ ret = replicas_table_update(c, &n); ++ if (ret) ++ return ret; ++ ++ kfree(n.entries); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ BUG_ON(ret < 0); ++ } ++ ++ c->usage_base->replicas[idx] = sectors; ++ ++ return 0; ++} ++ ++/* Replicas tracking - superblock: */ ++ ++static int ++__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry *e, *dst; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ dst = cpu_replicas_entry(cpu_r, idx++); ++ memcpy(dst, e, replicas_entry_bytes(e)); ++ bch2_replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++static int ++__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry_v0 *e; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ entry_size += sizeof(struct bch_replicas_entry) - ++ sizeof(struct bch_replicas_entry_v0); ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ struct bch_replicas_entry *dst = ++ cpu_replicas_entry(cpu_r, idx++); ++ ++ dst->data_type = e->data_type; ++ dst->nr_devs = e->nr_devs; ++ dst->nr_required = 1; ++ memcpy(dst->devs, e->devs, e->nr_devs); ++ bch2_replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ++{ ++ struct bch_sb_field_replicas *sb_v1; ++ struct bch_sb_field_replicas_v0 *sb_v0; ++ struct bch_replicas_cpu new_r = { 0, 0, NULL }; ++ int ret = 0; ++ ++ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); ++ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); ++ ++ if (ret) ++ return -ENOMEM; ++ ++ bch2_cpu_replicas_sort(&new_r); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ ret = replicas_table_update(c, &new_r); ++ percpu_up_write(&c->mark_lock); ++ ++ kfree(new_r.entries); ++ ++ return 0; ++} ++ ++static int 
bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r; ++ struct bch_replicas_entry_v0 *dst; ++ struct bch_replicas_entry *src; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) ++ bytes += replicas_entry_bytes(src) - 1; ++ ++ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); ++ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ dst->data_type = src->data_type; ++ dst->nr_devs = src->nr_devs; ++ memcpy(dst->devs, src->devs, src->nr_devs); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas *sb_r; ++ struct bch_replicas_entry *dst, *src; ++ bool need_v1 = false; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) { ++ bytes += replicas_entry_bytes(src); ++ if (src->nr_required != 1) ++ need_v1 = true; ++ } ++ ++ if (!need_v1) ++ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); ++ ++ sb_r = bch2_sb_resize_replicas(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); ++ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ memcpy(dst, src, replicas_entry_bytes(src)); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, ++ struct bch_sb *sb, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned i, j; ++ ++ sort_cmp_size(cpu_r->entries, ++ cpu_r->nr, ++ cpu_r->entry_size, ++ memcmp, NULL); ++ ++ for (i = 0; i < cpu_r->nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(cpu_r, i); ++ ++ if (e->data_type >= BCH_DATA_NR) { ++ prt_printf(err, "invalid data type in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (!e->nr_devs) { ++ prt_printf(err, "no devices in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) { ++ prt_printf(err, "bad nr_required in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ for (j = 0; j < e->nr_devs; j++) ++ if (!bch2_dev_exists(sb, mi, e->devs[j])) { ++ prt_printf(err, "invalid device %u in entry ", e->devs[j]); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < cpu_r->nr) { ++ struct bch_replicas_entry *n = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(e, n, cpu_r->entry_size)) { ++ prt_printf(err, "duplicate replicas entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ 
struct printbuf *err) ++{ ++ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_replicas_cpu cpu_r; ++ int ret; ++ ++ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) ++ return -ENOMEM; ++ ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); ++ kfree(cpu_r.entries); ++ return ret; ++} ++ ++static void bch2_sb_replicas_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *r = field_to_type(f, replicas); ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_replicas_entry(r, e) { ++ if (!first) ++ prt_printf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas = { ++ .validate = bch2_sb_replicas_validate, ++ .to_text = bch2_sb_replicas_to_text, ++}; ++ ++static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_cpu cpu_r; ++ int ret; ++ ++ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) ++ return -ENOMEM; ++ ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); ++ kfree(cpu_r.entries); ++ return ret; ++} ++ ++static void bch2_sb_replicas_v0_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_entry_v0 *e; ++ bool first = true; ++ ++ for_each_replicas_entry(sb_r, e) { ++ if (!first) ++ prt_printf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_v0_to_text(out, e); ++ } ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { ++ .validate = bch2_sb_replicas_v0_validate, ++ .to_text = bch2_sb_replicas_v0_to_text, ++}; ++ ++/* Query replicas: */ ++ ++bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, ++ unsigned flags, bool print) ++{ ++ struct bch_replicas_entry *e; ++ bool ret = true; ++ ++ percpu_down_read(&c->mark_lock); ++ for_each_cpu_replicas_entry(&c->replicas, e) { ++ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; ++ bool metadata = e->data_type < BCH_DATA_user; ++ ++ if (e->data_type == BCH_DATA_cached) ++ continue; ++ ++ for (i = 0; i < e->nr_devs; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); ++ ++ nr_online += test_bit(e->devs[i], devs.d); ++ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; ++ } ++ ++ if (nr_failed == e->nr_devs) ++ continue; ++ ++ if (nr_online < e->nr_required) ++ dflags |= metadata ++ ? BCH_FORCE_IF_METADATA_LOST ++ : BCH_FORCE_IF_DATA_LOST; ++ ++ if (nr_online < e->nr_devs) ++ dflags |= metadata ++ ? 
BCH_FORCE_IF_METADATA_DEGRADED ++ : BCH_FORCE_IF_DATA_DEGRADED; ++ ++ if (dflags & ~flags) { ++ if (print) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_replicas_entry_to_text(&buf, e); ++ bch_err(c, "insufficient devices online (%u) for replicas entry %s", ++ nr_online, buf.buf); ++ printbuf_exit(&buf); ++ } ++ ret = false; ++ break; ++ } ++ ++ } ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) ++{ ++ struct bch_sb_field_replicas *replicas; ++ struct bch_sb_field_replicas_v0 *replicas_v0; ++ unsigned i, data_has = 0; ++ ++ replicas = bch2_sb_get_replicas(sb); ++ replicas_v0 = bch2_sb_get_replicas_v0(sb); ++ ++ if (replicas) { ++ struct bch_replicas_entry *r; ++ ++ for_each_replicas_entry(replicas, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } else if (replicas_v0) { ++ struct bch_replicas_entry_v0 *r; ++ ++ for_each_replicas_entry_v0(replicas_v0, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } ++ ++ ++ return data_has; ++} ++ ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned ret; ++ ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++void bch2_fs_replicas_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ kfree(c->usage_scratch); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ free_percpu(c->usage[i]); ++ kfree(c->usage_base); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ ++ mempool_exit(&c->replicas_delta_pool); ++} ++ ++int bch2_fs_replicas_init(struct bch_fs *c) ++{ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &c->replicas)); ++ ++ return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, ++ REPLICAS_DELTA_LIST_MAX) ?: ++ replicas_table_update(c, &c->replicas); ++} +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +new file mode 100644 +index 000000000000..87820b2e1ad3 +--- /dev/null ++++ b/fs/bcachefs/replicas.h +@@ -0,0 +1,106 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REPLICAS_H ++#define _BCACHEFS_REPLICAS_H ++ ++#include "eytzinger.h" ++#include "replicas_types.h" ++ ++void bch2_replicas_entry_sort(struct bch_replicas_entry *); ++void bch2_replicas_entry_to_text(struct printbuf *, ++ struct bch_replicas_entry *); ++void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); ++ ++static inline struct bch_replicas_entry * ++cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) ++{ ++ return (void *) r->entries + r->entry_size * i; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *, ++ enum bch_data_type, ++ struct bch_devs_list); ++bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); ++int bch2_mark_replicas(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} 
++ ++int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); ++ ++static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, ++ unsigned dev) ++{ ++ e->data_type = BCH_DATA_cached; ++ e->nr_devs = 1; ++ e->nr_required = 1; ++ e->devs[0] = dev; ++} ++ ++bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, ++ unsigned, bool); ++ ++unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); ++unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); ++ ++int bch2_replicas_gc_end(struct bch_fs *, int); ++int bch2_replicas_gc_start(struct bch_fs *, unsigned); ++int bch2_replicas_gc2(struct bch_fs *); ++ ++int bch2_replicas_set_usage(struct bch_fs *, ++ struct bch_replicas_entry *, ++ u64); ++ ++#define for_each_cpu_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ ++ _i = (void *) (_i) + (_r)->entry_size) ++ ++/* iterate over superblock replicas - used by userspace tools: */ ++ ++#define replicas_entry_next(_i) \ ++ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) ++ ++#define for_each_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++#define for_each_replicas_entry_v0(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; ++ ++void bch2_fs_replicas_exit(struct bch_fs *); ++int bch2_fs_replicas_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h +new file mode 100644 +index 000000000000..0535b1d3760e +--- /dev/null ++++ b/fs/bcachefs/replicas_types.h +@@ -0,0 +1,10 @@ ++#ifndef _BCACHEFS_REPLICAS_TYPES_H ++#define _BCACHEFS_REPLICAS_TYPES_H ++ ++struct bch_replicas_cpu { ++ unsigned nr; ++ unsigned entry_size; ++ struct bch_replicas_entry *entries; ++}; ++ ++#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c +new file mode 100644 +index 000000000000..c062edb3fbc2 +--- /dev/null ++++ b/fs/bcachefs/siphash.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: BSD-3-Clause ++/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ ++ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ */ ++ ++/* ++ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d ++ * are the number of compression rounds and the number of finalization rounds. ++ * A compression round is identical to a finalization round and this round ++ * function is called SipRound. Given a 128-bit key k and a (possibly empty) ++ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). ++ * ++ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, ++ * by Jean-Philippe Aumasson and Daniel J. Bernstein, ++ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa ++ * https://131002.net/siphash/siphash.pdf ++ * https://131002.net/siphash/ ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "siphash.h" ++ ++static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) ++{ ++ while (rounds--) { ++ ctx->v[0] += ctx->v[1]; ++ ctx->v[2] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 13); ++ ctx->v[3] = rol64(ctx->v[3], 16); ++ ++ ctx->v[1] ^= ctx->v[0]; ++ ctx->v[3] ^= ctx->v[2]; ++ ctx->v[0] = rol64(ctx->v[0], 32); ++ ++ ctx->v[2] += ctx->v[1]; ++ ctx->v[0] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 17); ++ ctx->v[3] = rol64(ctx->v[3], 21); ++ ++ ctx->v[1] ^= ctx->v[2]; ++ ctx->v[3] ^= ctx->v[0]; ++ ctx->v[2] = rol64(ctx->v[2], 32); ++ } ++} ++ ++static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) ++{ ++ u64 m = get_unaligned_le64(ptr); ++ ++ ctx->v[3] ^= m; ++ SipHash_Rounds(ctx, rounds); ++ ctx->v[0] ^= m; ++} ++ ++void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ++{ ++ u64 k0, k1; ++ ++ k0 = le64_to_cpu(key->k0); ++ k1 = le64_to_cpu(key->k1); ++ ++ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ++ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ++ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ++ ctx->v[3] = 0x7465646279746573ULL ^ k1; ++ ++ memset(ctx->buf, 0, sizeof(ctx->buf)); ++ ctx->bytes = 0; ++} ++ ++void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, ++ const void *src, size_t len) ++{ ++ const u8 *ptr = src; ++ size_t left, used; ++ ++ if (len == 0) ++ return; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ ctx->bytes += len; ++ ++ if (used > 0) { ++ left = sizeof(ctx->buf) - used; ++ ++ if (len >= left) { ++ memcpy(&ctx->buf[used], ptr, left); ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ len -= left; ++ ptr += left; ++ } else { ++ memcpy(&ctx->buf[used], ptr, len); ++ return; ++ } ++ } ++ ++ while (len >= sizeof(ctx->buf)) { ++ SipHash_CRounds(ctx, ptr, rc); ++ len -= sizeof(ctx->buf); ++ ptr += sizeof(ctx->buf); ++ } ++ ++ if (len > 0) ++ memcpy(&ctx->buf[used], ptr, len); ++} ++ ++void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ ++ r = SipHash_End(ctx, rc, rf); ++ ++ *((__le64 *) dst) = cpu_to_le64(r); ++} 
++ ++u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ size_t left, used; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ left = sizeof(ctx->buf) - used; ++ memset(&ctx->buf[used], 0, left - 1); ++ ctx->buf[7] = ctx->bytes; ++ ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ ctx->v[2] ^= 0xff; ++ SipHash_Rounds(ctx, rf); ++ ++ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); ++ memset(ctx, 0, sizeof(*ctx)); ++ return (r); ++} ++ ++u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) ++{ ++ SIPHASH_CTX ctx; ++ ++ SipHash_Init(&ctx, key); ++ SipHash_Update(&ctx, rc, rf, src, len); ++ return SipHash_End(&ctx, rc, rf); ++} +diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h +new file mode 100644 +index 000000000000..3dfaf34a43b2 +--- /dev/null ++++ b/fs/bcachefs/siphash.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: BSD-3-Clause */ ++/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ * ++ * $FreeBSD$ ++ */ ++ ++/* ++ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) ++ * optimized for speed on short messages returning a 64bit hash/digest value. 
++ * ++ * The number of rounds is defined during the initialization: ++ * SipHash24_Init() for the fast and resonable strong version ++ * SipHash48_Init() for the strong version (half as fast) ++ * ++ * struct SIPHASH_CTX ctx; ++ * SipHash24_Init(&ctx); ++ * SipHash_SetKey(&ctx, "16bytes long key"); ++ * SipHash_Update(&ctx, pointer_to_string, length_of_string); ++ * SipHash_Final(output, &ctx); ++ */ ++ ++#ifndef _SIPHASH_H_ ++#define _SIPHASH_H_ ++ ++#include ++ ++#define SIPHASH_BLOCK_LENGTH 8 ++#define SIPHASH_KEY_LENGTH 16 ++#define SIPHASH_DIGEST_LENGTH 8 ++ ++typedef struct _SIPHASH_CTX { ++ u64 v[4]; ++ u8 buf[SIPHASH_BLOCK_LENGTH]; ++ u32 bytes; ++} SIPHASH_CTX; ++ ++typedef struct { ++ __le64 k0; ++ __le64 k1; ++} SIPHASH_KEY; ++ ++void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); ++void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); ++u64 SipHash_End(SIPHASH_CTX *, int, int); ++void SipHash_Final(void *, SIPHASH_CTX *, int, int); ++u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); ++ ++#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) ++#define SipHash24_End(_d) SipHash_End((_d), 2, 4) ++#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) ++#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) ++ ++#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) ++#define SipHash48_End(_d) SipHash_End((_d), 4, 8) ++#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) ++#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) ++ ++#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +new file mode 100644 +index 000000000000..591bbb9f8beb +--- /dev/null ++++ b/fs/bcachefs/str_hash.h +@@ -0,0 +1,351 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_STR_HASH_H ++#define _BCACHEFS_STR_HASH_H ++ ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "checksum.h" ++#include "error.h" ++#include "inode.h" ++#include "siphash.h" ++#include "subvolume.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++static inline enum bch_str_hash_type ++bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) ++{ ++ switch (opt) { ++ case BCH_STR_HASH_OPT_crc32c: ++ return BCH_STR_HASH_crc32c; ++ case BCH_STR_HASH_OPT_crc64: ++ return BCH_STR_HASH_crc64; ++ case BCH_STR_HASH_OPT_siphash: ++ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ++ ? BCH_STR_HASH_siphash ++ : BCH_STR_HASH_siphash_old; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_info { ++ u8 type; ++ /* ++ * For crc32 or crc64 string hashes the first key value of ++ * the siphash_key (k0) is used as the key. 
++ */ ++ SIPHASH_KEY siphash_key; ++}; ++ ++static inline struct bch_hash_info ++bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) ++{ ++ /* XXX ick */ ++ struct bch_hash_info info = { ++ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ++ ~(~0U << INODE_STR_HASH_BITS), ++ .siphash_key = { .k0 = bi->bi_hash_seed } ++ }; ++ ++ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { ++ SHASH_DESC_ON_STACK(desc, c->sha256); ++ u8 digest[SHA256_DIGEST_SIZE]; ++ ++ desc->tfm = c->sha256; ++ ++ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, ++ sizeof(bi->bi_hash_seed), digest); ++ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); ++ } ++ ++ return info; ++} ++ ++struct bch_str_hash_ctx { ++ union { ++ u32 crc32c; ++ u64 crc64; ++ SIPHASH_CTX siphash; ++ }; ++}; ++ ++static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_crc32c: ++ ctx->crc32c = crc32c(~0, &info->siphash_key.k0, ++ sizeof(info->siphash_key.k0)); ++ break; ++ case BCH_STR_HASH_crc64: ++ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, ++ sizeof(info->siphash_key.k0)); ++ break; ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: ++ SipHash24_Init(&ctx->siphash, &info->siphash_key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info, ++ const void *data, size_t len) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_crc32c: ++ ctx->crc32c = crc32c(ctx->crc32c, data, len); ++ break; ++ case BCH_STR_HASH_crc64: ++ ctx->crc64 = crc64_be(ctx->crc64, data, len); ++ break; ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: ++ SipHash24_Update(&ctx->siphash, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_crc32c: ++ return ctx->crc32c; ++ case BCH_STR_HASH_crc64: ++ return ctx->crc64 >> 1; ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: ++ return SipHash24_End(&ctx->siphash) >> 1; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_desc { ++ enum btree_id btree_id; ++ u8 key_type; ++ ++ u64 (*hash_key)(const struct bch_hash_info *, const void *); ++ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); ++ bool (*cmp_key)(struct bkey_s_c, const void *); ++ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++ bool (*is_visible)(subvol_inum inum, struct bkey_s_c); ++}; ++ ++static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) ++{ ++ return k.k->type == desc.key_type && ++ (!desc.is_visible || desc.is_visible(inum, k)); ++} ++ ++static __always_inline int ++bch2_hash_lookup(struct btree_trans *trans, ++ struct btree_iter *iter, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, const void *key, ++ unsigned flags) ++{ ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|flags, k, ret) { ++ if (is_visible_key(desc, inum, k)) { ++ if (!desc.cmp_key(k, key)) ++ return 0; ++ } else if (k.k->type == KEY_TYPE_hash_whiteout) { ++ ; ++ } else { ++ 
/* hole, not found */ ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret ?: -ENOENT; ++} ++ ++static __always_inline int ++bch2_hash_hole(struct btree_trans *trans, ++ struct btree_iter *iter, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, const void *key) ++{ ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) ++ if (!is_visible_key(desc, inum, k)) ++ return 0; ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret ?: -ENOSPC; ++} ++ ++static __always_inline ++int bch2_hash_needs_whiteout(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *start) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_copy_iter(&iter, start); ++ ++ bch2_btree_iter_advance(&iter); ++ ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->type != desc.key_type && ++ k.k->type != KEY_TYPE_hash_whiteout) ++ break; ++ ++ if (k.k->type == desc.key_type && ++ desc.hash_bkey(info, k) <= start->pos.offset) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static __always_inline ++int bch2_hash_set(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, ++ struct bkey_i *insert, int flags) ++{ ++ struct btree_iter iter, slot = { NULL }; ++ struct bkey_s_c k; ++ bool found = false; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, ++ SPOS(inum.inum, ++ desc.hash_bkey(info, bkey_i_to_s_c(insert)), ++ snapshot), ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (is_visible_key(desc, inum, k)) { ++ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) ++ goto found; ++ ++ /* hash collision: */ ++ continue; ++ } ++ ++ if (!slot.path && ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) ++ bch2_trans_copy_iter(&slot, &iter); ++ ++ if (k.k->type != KEY_TYPE_hash_whiteout) ++ goto not_found; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++out: ++ bch2_trans_iter_exit(trans, &slot); ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++found: ++ found = true; ++not_found: ++ ++ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { ++ ret = -ENOENT; ++ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ++ ret = -EEXIST; ++ } else { ++ if (!found && slot.path) ++ swap(iter, slot); ++ ++ insert->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, insert, 0); ++ } ++ ++ goto out; ++} ++ ++static __always_inline ++int bch2_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *iter, ++ unsigned update_flags) ++{ ++ struct bkey_i *delete; ++ int ret; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ ret = PTR_ERR_OR_ZERO(delete); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); ++ if (ret < 0) ++ return ret; ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter->pos; ++ delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; ++ ++ return bch2_trans_update(trans, iter, delete, update_flags); ++} ++ ++static __always_inline ++int bch2_hash_delete(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, const void *key) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++#endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +new file mode 100644 +index 000000000000..b5b0f5e39f97 +--- /dev/null ++++ b/fs/bcachefs/subvolume.c +@@ -0,0 +1,1108 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs.h" ++#include "subvolume.h" ++ ++/* Snapshot tree: */ ++ ++void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ ++ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", ++ BCH_SNAPSHOT_SUBVOL(s.v), ++ BCH_SNAPSHOT_DELETED(s.v), ++ le32_to_cpu(s.v->parent), ++ le32_to_cpu(s.v->children[0]), ++ le32_to_cpu(s.v->children[1]), ++ le32_to_cpu(s.v->subvol)); ++} ++ ++int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_snapshot s; ++ u32 i, id; ++ ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || ++ bkey_cmp(k.k->p, POS(0, 1)) < 0) { ++ prt_printf(err, "bad pos"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { ++ prt_printf(err, "bad val size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); ++ return -EINVAL; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ id = le32_to_cpu(s.v->parent); ++ if (id && id <= k.k->p.offset) { ++ prt_printf(err, "bad parent node (%u <= %llu)", ++ id, k.k->p.offset); ++ return -EINVAL; ++ } ++ ++ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { ++ prt_printf(err, "children not normalized"); ++ return -EINVAL; ++ } ++ ++ if (s.v->children[0] && ++ s.v->children[0] == s.v->children[1]) { ++ prt_printf(err, "duplicate child nodes"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < 2; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ if (id >= k.k->p.offset) { ++ prt_printf(err, "bad child node (%u >= %llu)", ++ id, k.k->p.offset); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_mark_snapshot(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_t *t; ++ ++ t = genradix_ptr_alloc(&c->snapshots, ++ U32_MAX - new.k->p.offset, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ if (new.k->type == KEY_TYPE_snapshot) { ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); ++ ++ t->parent = le32_to_cpu(s.v->parent); ++ t->children[0] = le32_to_cpu(s.v->children[0]); ++ t->children[1] = le32_to_cpu(s.v->children[1]); ++ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; ++ } else { ++ t->parent = 0; ++ t->children[0] = 0; ++ t->children[1] = 0; ++ t->subvol = 0; ++ } ++ ++ return 0; ++} ++ ++static int snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; ++ ++ if (!ret) ++ *s = *bkey_s_c_to_snapshot(k).v; ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int snapshot_live(struct btree_trans *trans, u32 id) ++{ ++ struct bch_snapshot v; ++ int ret; ++ ++ if (!id) ++ return 0; ++ ++ ret = snapshot_lookup(trans, id, &v); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "snapshot node %u not found", id); ++ if (ret) ++ return ret; ++ ++ return !BCH_SNAPSHOT_DELETED(&v); ++} ++ ++static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i, nr_live = 0, live_idx = 0; ++ struct bkey_s_c_snapshot snap; ++ u32 id = k.k->p.offset, child[2]; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); ++ ++ for (i = 0; i < 2; i++) { ++ int ret = snapshot_live(trans, child[i]); ++ if (ret < 0) ++ return ret; ++ ++ if (ret) ++ live_idx = i; ++ nr_live += ret; ++ } ++ ++ snapshot_t(c, id)->equiv = nr_live == 1 ++ ? snapshot_t(c, child[live_idx])->equiv ++ : id; ++ return 0; ++} ++ ++/* fsck: */ ++static int check_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot s; ++ struct bch_subvolume subvol; ++ struct bch_snapshot v; ++ struct printbuf buf = PRINTBUF; ++ bool should_have_subvol; ++ u32 i, id; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ s = bkey_s_c_to_snapshot(k); ++ id = le32_to_cpu(s.v->parent); ++ if (id) { ++ ret = snapshot_lookup(trans, id, &v); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot with nonexistent parent:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.children[0]) != s.k->p.offset && ++ le32_to_cpu(v.children[1]) != s.k->p.offset) { ++ bch_err(c, "snapshot parent %u missing pointer to child %llu", ++ id, s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < 2 && s.v->children[i]; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ ret = snapshot_lookup(trans, id, &v); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot node %llu has nonexistent child %u", ++ s.k->p.offset, id); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.parent) != s.k->p.offset) { ++ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ id, le32_to_cpu(v.parent), s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && ++ !BCH_SNAPSHOT_DELETED(s.v); ++ ++ if (should_have_subvol) { ++ id = le32_to_cpu(s.v->subvol); ++ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { ++ bch_err(c, "snapshot 
node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } else { ++ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ++ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&u->k_i, s.s_c); ++ u->v.subvol = 0; ++ ret = bch2_trans_update(trans, iter, &u->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot(&trans, &iter, k)); ++ ++ if (ret) ++ bch_err(c, "error %i checking snapshots", ret); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int check_subvol(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume subvol; ++ struct bch_snapshot snapshot; ++ unsigned snapid; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_subvolume) ++ return 0; ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ret = snapshot_lookup(trans, snapid, &snapshot); ++ ++ if (ret == -ENOENT) ++ bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", ++ k.k->p.offset, snapid); ++ if (ret) ++ return ret; ++ ++ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ ret = bch2_subvolume_delete(trans, iter->pos.offset); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error deleting subvolume %llu: %s", ++ iter->pos.offset, bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_fs_check_subvols(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_subvol(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++void bch2_fs_snapshots_exit(struct bch_fs *c) ++{ ++ genradix_free(&c->snapshots); ++} ++ ++int bch2_fs_snapshots_start(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: ++ bch2_snapshot_set_equiv(&trans, k)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Mark a snapshot as deleted, for future cleanup: ++ */ ++static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_snapshot *s; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != 
KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ /* already deleted? */ ++ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) ++ goto err; ++ ++ s = bch2_trans_kmalloc(trans, sizeof(*s)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&s->k_i, k); ++ SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); ++ s->v.subvol = 0; ++ ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot s; ++ struct bkey_i_snapshot *parent; ++ u32 parent_id; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); ++ parent_id = le32_to_cpu(s.v->parent); ++ ++ if (parent_id) { ++ bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, ++ POS(0, parent_id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&p_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ parent = bch2_trans_kmalloc(trans, sizeof(*parent)); ++ ret = PTR_ERR_OR_ZERO(parent); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&parent->k_i, k); ++ ++ for (i = 0; i < 2; i++) ++ if (le32_to_cpu(parent->v.children[i]) == id) ++ break; ++ ++ if (i == 2) ++ bch_err(trans->c, "snapshot %u missing child pointer to %u", ++ parent_id, id); ++ else ++ parent->v.children[i] = 0; ++ ++ if (le32_to_cpu(parent->v.children[0]) < ++ le32_to_cpu(parent->v.children[1])) ++ swap(parent->v.children[0], ++ parent->v.children[1]); ++ ++ ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &p_iter); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS_MIN, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < nr_snapids; i++) { ++ k = bch2_btree_iter_prev_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || !k.k->p.offset) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_snapshot_init(&n->k_i); ++ n->k.p = iter.pos; ++ n->v.flags = 0; ++ n->v.parent = cpu_to_le32(parent); ++ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); ++ n->v.pad = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ++ ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: ++ bch2_mark_snapshot(trans, bkey_s_c_null, 
bkey_i_to_s_c(&n->k_i), 0); ++ if (ret) ++ goto err; ++ ++ new_snapids[i] = iter.pos.offset; ++ } ++ ++ if (parent) { ++ bch2_btree_iter_set_pos(&iter, POS(0, parent)); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(trans->c, "snapshot %u not found", parent); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, k); ++ ++ if (n->v.children[0] || n->v.children[1]) { ++ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ n->v.children[0] = cpu_to_le32(new_snapids[0]); ++ n->v.children[1] = cpu_to_le32(new_snapids[1]); ++ n->v.subvol = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int snapshot_delete_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *deleted, ++ snapshot_id_list *equiv_seen, ++ struct bpos *last_pos) ++{ ++ struct bch_fs *c = trans->c; ++ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; ++ ++ if (bkey_cmp(k.k->p, *last_pos)) ++ equiv_seen->nr = 0; ++ *last_pos = k.k->p; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(equiv_seen, equiv)) { ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } else { ++ return snapshot_list_add(c, equiv_seen, equiv); ++ } ++} ++ ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot snap; ++ u32 children[2]; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ return 0; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = snapshot_live(trans, children[0]) ?: ++ snapshot_live(trans, children[1]); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret) ++ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return 0; ++} ++ ++int bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot snap; ++ snapshot_id_list deleted = { 0 }; ++ u32 i, id; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) ++ return 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ ret = bch2_fs_read_write_early(c); ++ if (ret) { ++ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * For every snapshot node: If we have no live children and it's not ++ * pointed to by a subvolume, delete it: ++ */ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ NULL, NULL, 0, ++ bch2_delete_redundant_snapshot(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_snapshot_set_equiv(&trans, k)); ++ if (ret) { ++ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ 
for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v)) { ++ ret = snapshot_list_add(c, &deleted, k.k->p.offset); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ struct bpos last_pos = POS_MIN; ++ snapshot_id_list equiv_seen = { 0 }; ++ ++ if (!btree_type_has_snapshots(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); ++ ++ darray_exit(&equiv_seen); ++ ++ if (ret) { ++ bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < deleted.nr; i++) { ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, deleted.data[i])); ++ if (ret) { ++ bch_err(c, "error deleting snapshot %u: %s", ++ deleted.data[i], bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++ darray_exit(&deleted); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ ++ bch2_delete_dead_snapshots(c); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_delete_dead_snapshots_async(struct bch_fs *c) ++{ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return; ++ ++ if (!queue_work(system_long_wq, &c->snapshot_delete_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ struct bch_fs *c = trans->c; ++ ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ ++ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) ++ return 0; ++ ++ bch2_delete_dead_snapshots_async(c); ++ return 0; ++} ++ ++/* Subvolumes: */ ++ ++int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || ++ bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { ++ prt_printf(err, "invalid pos"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); ++ ++ prt_printf(out, "root %llu snapshot id %u", ++ le64_to_cpu(s.v->inode), ++ le32_to_cpu(s.v->snapshot)); ++} ++ ++int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, ++ bool inconsistent_if_not_found, ++ int iter_flags, ++ struct bch_subvolume *s) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), ++ iter_flags); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; ++ ++ if (ret == -ENOENT && inconsistent_if_not_found) ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); ++ if (!ret) ++ *s = *bkey_s_c_to_subvolume(k).v; ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, ++ struct bch_subvolume *subvol) ++{ ++ struct bch_snapshot snap; ++ ++ return snapshot_lookup(trans, snapshot, &snap) ?: ++ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); ++} ++ ++int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, ++ u32 *snapid) ++{ ++ struct bch_subvolume s; ++ int ret; ++ ++ ret = bch2_subvolume_get(trans, subvol, true, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_WITH_UPDATES, ++ &s); ++ ++ *snapid = le32_to_cpu(s.snapshot); ++ return ret; ++} ++ ++/* ++ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot ++ * deletion/cleanup: ++ */ ++int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; ++ struct btree_trans_commit_hook *h; ++ u32 snapid; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); ++ ret = -EIO; ++ goto err; ++ } ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++ if (ret) ++ goto err; ++ ++ ret = bch2_snapshot_node_set_deleted(trans, snapid); ++ ++ h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ ret = PTR_ERR_OR_ZERO(h); ++ if (ret) ++ goto err; ++ ++ h->fn = bch2_delete_dead_snapshots_hook; ++ bch2_trans_commit_hook(trans, h); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ snapshot_wait_for_pagecache_and_delete_work); ++ snapshot_id_list s; ++ u32 *id; ++ int ret = 0; ++ ++ while (!ret) { ++ mutex_lock(&c->snapshots_unlinked_lock); ++ s = c->snapshots_unlinked; ++ darray_init(&c->snapshots_unlinked); ++ mutex_unlock(&c->snapshots_unlinked_lock); ++ ++ if (!s.nr) ++ break; ++ ++ bch2_evict_subvolume_inodes(c, &s); ++ ++ for (id = s.data; id < s.data + s.nr; id++) { ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_subvolume_delete(&trans, *id)); ++ if (ret) { ++ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ darray_exit(&s); ++ } ++ ++ percpu_ref_put(&c->writes); ++} ++ ++struct subvolume_unlink_hook { ++ struct btree_trans_commit_hook h; ++ u32 subvol; ++}; ++ ++int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *_h) ++{ ++ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); ++ struct bch_fs *c = trans->c; ++ int ret = 0; ++ ++ mutex_lock(&c->snapshots_unlinked_lock); ++ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) ++ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); ++ mutex_unlock(&c->snapshots_unlinked_lock); ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(!percpu_ref_tryget_live(&c->writes))) ++ return -EROFS; ++ ++ if (!queue_work(system_long_wq, 
&c->snapshot_wait_for_pagecache_and_delete_work)) ++ percpu_ref_put(&c->writes); ++ return 0; ++} ++ ++int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_subvolume *n; ++ struct subvolume_unlink_hook *h; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); ++ ret = -EIO; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, k); ++ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); ++ ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ ret = PTR_ERR_OR_ZERO(h); ++ if (ret) ++ goto err; ++ ++ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; ++ h->subvol = subvolid; ++ bch2_trans_commit_hook(trans, &h->h); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_subvolume_create(struct btree_trans *trans, u64 inode, ++ u32 src_subvolid, ++ u32 *new_subvolid, ++ u32 *new_snapshotid, ++ bool ro) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; ++ struct bkey_i_subvolume *new_subvol = NULL; ++ struct bkey_i_subvolume *src_subvol = NULL; ++ struct bkey_s_c k; ++ u32 parent = 0, new_nodes[2], snapshot_subvols[2]; ++ int ret = 0; ++ ++ for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) ++ break; ++ ++ /* ++ * bch2_subvolume_delete() doesn't flush the btree key cache - ++ * ideally it would but that's tricky ++ */ ++ if (bkey_deleted(k.k) && ++ !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) ++ goto found_slot; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++ goto err; ++found_slot: ++ snapshot_subvols[0] = dst_iter.pos.offset; ++ snapshot_subvols[1] = src_subvolid; ++ ++ if (src_subvolid) { ++ /* Creating a snapshot: */ ++ src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); ++ ret = PTR_ERR_OR_ZERO(src_subvol); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, ++ POS(0, src_subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&src_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch_err(c, "subvolume %u not found", src_subvolid); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ bkey_reassemble(&src_subvol->k_i, k); ++ parent = le32_to_cpu(src_subvol->v.snapshot); ++ } ++ ++ ret = bch2_snapshot_node_create(trans, parent, new_nodes, ++ snapshot_subvols, ++ src_subvolid ? 
2 : 1); ++ if (ret) ++ goto err; ++ ++ if (src_subvolid) { ++ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); ++ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ ++ new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); ++ ret = PTR_ERR_OR_ZERO(new_subvol); ++ if (ret) ++ goto err; ++ ++ bkey_subvolume_init(&new_subvol->k_i); ++ new_subvol->v.flags = 0; ++ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); ++ new_subvol->v.inode = cpu_to_le64(inode); ++ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); ++ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); ++ new_subvol->k.p = dst_iter.pos; ++ ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ if (ret) ++ goto err; ++ ++ *new_subvolid = new_subvol->k.p.offset; ++ *new_snapshotid = new_nodes[0]; ++err: ++ bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(trans, &dst_iter); ++ return ret; ++} ++ ++int bch2_fs_subvolumes_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); ++ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, ++ bch2_subvolume_wait_for_pagecache_and_delete); ++ mutex_init(&c->snapshots_unlinked_lock); ++ return 0; ++} +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +new file mode 100644 +index 000000000000..02a636644988 +--- /dev/null ++++ b/fs/bcachefs/subvolume.h +@@ -0,0 +1,137 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUBVOLUME_H ++#define _BCACHEFS_SUBVOLUME_H ++ ++#include "darray.h" ++#include "subvolume_types.h" ++ ++void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); ++ ++#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_invalid, \ ++ .val_to_text = bch2_snapshot_to_text, \ ++} ++ ++int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); ++ ++static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++{ ++ return genradix_ptr(&c->snapshots, U32_MAX - id); ++} ++ ++static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->parent; ++} ++ ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->equiv; ++} ++ ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++{ ++ return id == snapshot_t(c, id)->equiv; ++} ++ ++static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) ++{ ++ struct snapshot_t *s = snapshot_t(c, id); ++ ++ return s->children[0] || s->children[1]; ++} ++ ++static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) ++{ ++ struct snapshot_t *s; ++ u32 parent = bch2_snapshot_parent(c, id); ++ ++ if (!parent) ++ return 0; ++ ++ s = snapshot_t(c, bch2_snapshot_parent(c, id)); ++ if (id == s->children[0]) ++ return s->children[1]; ++ if (id == s->children[1]) ++ return s->children[0]; ++ return 0; ++} ++ ++static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ while (id && id < ancestor) ++ id = bch2_snapshot_parent(c, id); ++ ++ return id == ancestor; ++} ++ ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (*i == id) ++ return true; ++ return false; ++} ++ ++static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ 
if (bch2_snapshot_is_ancestor(c, id, *i)) ++ return true; ++ return false; ++} ++ ++static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ int ret; ++ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ret = darray_push(s, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *); ++int bch2_fs_check_subvols(struct bch_fs *); ++ ++void bch2_fs_snapshots_exit(struct bch_fs *); ++int bch2_fs_snapshots_start(struct bch_fs *); ++ ++int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); ++void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ ++ .key_invalid = bch2_subvolume_invalid, \ ++ .val_to_text = bch2_subvolume_to_text, \ ++} ++ ++int bch2_subvolume_get(struct btree_trans *, unsigned, ++ bool, int, struct bch_subvolume *); ++int bch2_snapshot_get_subvol(struct btree_trans *, u32, ++ struct bch_subvolume *); ++int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); ++ ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ u32 *, u32 *, unsigned); ++ ++int bch2_delete_dead_snapshots(struct bch_fs *); ++void bch2_delete_dead_snapshots_async(struct bch_fs *); ++ ++int bch2_subvolume_delete(struct btree_trans *, u32); ++int bch2_subvolume_unlink(struct btree_trans *, u32); ++int bch2_subvolume_create(struct btree_trans *, u64, u32, ++ u32 *, u32 *, bool); ++ ++int bch2_fs_subvolumes_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SUBVOLUME_H */ +diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h +new file mode 100644 +index 000000000000..f7562b5d51df +--- /dev/null ++++ b/fs/bcachefs/subvolume_types.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUBVOLUME_TYPES_H ++#define _BCACHEFS_SUBVOLUME_TYPES_H ++ ++#include "darray.h" ++ ++typedef DARRAY(u32) snapshot_id_list; ++ ++#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +new file mode 100644 +index 000000000000..8b8130993a59 +--- /dev/null ++++ b/fs/bcachefs/super-io.c +@@ -0,0 +1,1602 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_sb.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++#include "quota.h" ++#include "super-io.h" ++#include "super.h" ++#include "vstructs.h" ++#include "counters.h" ++ ++#include ++#include ++#include ++ ++#include ++ ++const char * const bch2_sb_fields[] = { ++#define x(name, nr) #name, ++ BCH_SB_FIELDS() ++#undef x ++ NULL ++}; ++ ++static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, ++ struct printbuf *); ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f; ++ ++ /* XXX: need locking around superblock to access optional fields */ ++ ++ vstruct_for_each(sb, f) ++ if (le32_to_cpu(f->type) == type) ++ return f; ++ return NULL; ++} ++ ++static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, ++ struct bch_sb_field *f, ++ unsigned u64s) ++{ ++ unsigned old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; ++ ++ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); ++ ++ if (!f && !u64s) { ++ /* nothing to do: */ ++ } else if (!f) { ++ f = vstruct_last(sb->sb); ++ memset(f, 0, sizeof(u64) * u64s); ++ f->u64s = cpu_to_le32(u64s); ++ f->type = 0; ++ } else { ++ void *src, *dst; ++ ++ src = vstruct_end(f); ++ ++ if (u64s) { ++ f->u64s = cpu_to_le32(u64s); ++ dst = vstruct_end(f); ++ } else { ++ dst = f; ++ } ++ ++ memmove(dst, src, vstruct_end(sb->sb) - src); ++ ++ if (dst > src) ++ memset(src, 0, dst - src); ++ } ++ ++ sb->sb->u64s = cpu_to_le32(sb_u64s); ++ ++ return u64s ? f : NULL; ++} ++ ++void bch2_sb_field_delete(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ++ if (f) ++ __bch2_sb_field_resize(sb, f, 0); ++} ++ ++/* Superblock realloc/free: */ ++ ++void bch2_free_super(struct bch_sb_handle *sb) ++{ ++ if (sb->bio) ++ bio_put(sb->bio); ++ if (!IS_ERR_OR_NULL(sb->bdev)) ++ blkdev_put(sb->bdev, sb->mode); ++ ++ kfree(sb->sb); ++ memset(sb, 0, sizeof(*sb)); ++} ++ ++int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) ++{ ++ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); ++ size_t new_buffer_size; ++ struct bch_sb *new_sb; ++ struct bio *bio; ++ ++ if (sb->bdev) ++ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); ++ ++ new_buffer_size = roundup_pow_of_two(new_bytes); ++ ++ if (sb->sb && sb->buffer_size >= new_buffer_size) ++ return 0; ++ ++ if (sb->have_layout) { ++ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; ++ ++ if (new_bytes > max_bytes) { ++ char buf[BDEVNAME_SIZE]; ++ ++ pr_err("%s: superblock too big: want %zu but have %llu", ++ bdevname(sb->bdev, buf), new_bytes, max_bytes); ++ return -ENOSPC; ++ } ++ } ++ ++ if (sb->buffer_size >= new_buffer_size && sb->sb) ++ return 0; ++ ++ if (dynamic_fault("bcachefs:add:super_realloc")) ++ return -ENOMEM; ++ ++ if (sb->have_bio) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(new_buffer_size, PAGE_SIZE)); ++ if (!bio) ++ return -ENOMEM; ++ ++ if (sb->bio) ++ bio_put(sb->bio); ++ sb->bio = bio; ++ } ++ ++ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); ++ if (!new_sb) ++ return -ENOMEM; ++ ++ sb->sb = new_sb; ++ sb->buffer_size = new_buffer_size; ++ ++ return 0; ++} ++ ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type, ++ unsigned u64s) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ ssize_t d = -old_u64s + u64s; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) ++ return NULL; ++ ++ if (sb->fs_sb) { ++ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* XXX: we're not checking that offline device have enough space */ ++ ++ for_each_online_member(ca, c, i) { ++ struct bch_sb_handle *sb = &ca->disk_sb; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { ++ percpu_ref_put(&ca->ref); ++ return NULL; ++ } ++ } ++ } ++ ++ f = bch2_sb_field_get(sb->sb, type); ++ f = __bch2_sb_field_resize(sb, f, u64s); ++ if (f) ++ f->type = cpu_to_le32(type); ++ return f; ++} ++ ++/* Superblock validate: */ ++ ++static inline void __bch2_sb_layout_size_assert(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++} ++ ++static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) ++{ ++ u64 offset, prev_offset, max_sectors; ++ unsigned i; ++ ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { ++ prt_printf(out, "Not a bcachefs superblock layout"); ++ return -EINVAL; ++ } ++ ++ if (layout->layout_type != 0) { ++ prt_printf(out, "Invalid superblock layout type %u", ++ layout->layout_type); ++ return -EINVAL; ++ } ++ ++ if (!layout->nr_superblocks) { ++ prt_printf(out, "Invalid superblock layout: no superblocks"); ++ return -EINVAL; ++ } ++ ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { ++ prt_printf(out, "Invalid superblock layout: too many superblocks"); ++ return -EINVAL; ++ } ++ ++ max_sectors = 1 << layout->sb_max_size_bits; ++ ++ prev_offset = le64_to_cpu(layout->sb_offset[0]); ++ ++ for (i = 1; i < layout->nr_superblocks; i++) { ++ offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset < prev_offset + max_sectors) { ++ prt_printf(out, "Invalid superblock layout: superblocks overlap\n" ++ " (sb %u ends at %llu next starts at %llu", ++ i - 1, prev_offset + max_sectors, offset); ++ return -EINVAL; ++ } ++ prev_offset = offset; ++ } ++ ++ return 0; ++} ++ ++static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, ++ int rw) ++{ ++ struct bch_sb *sb = disk_sb->sb; ++ struct bch_sb_field *f; ++ struct bch_sb_field_members *mi; ++ enum bch_opt_id opt_id; ++ u32 version, version_min; ++ u16 block_size; ++ int ret; ++ ++ version = le16_to_cpu(sb->version); ++ version_min = version >= bcachefs_metadata_version_bkey_renumber ++ ? 
le16_to_cpu(sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min > version) { ++ prt_printf(out, "Bad minimum version %u, greater than version field %u", ++ version_min, version); ++ return -EINVAL; ++ } ++ ++ if (sb->features[1] || ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { ++ prt_printf(out, "Filesystem has incompatible features"); ++ return -EINVAL; ++ } ++ ++ block_size = le16_to_cpu(sb->block_size); ++ ++ if (block_size > PAGE_SECTORS) { ++ prt_printf(out, "Block size too big (got %u, max %u)", ++ block_size, PAGE_SECTORS); ++ return -EINVAL; ++ } ++ ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { ++ prt_printf(out, "Bad user UUID (got zeroes)"); ++ return -EINVAL; ++ } ++ ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { ++ prt_printf(out, "Bad internal UUID (got zeroes)"); ++ return -EINVAL; ++ } ++ ++ if (!sb->nr_devices || ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) { ++ prt_printf(out, "Bad number of member devices %u (max %u)", ++ sb->nr_devices, BCH_SB_MEMBERS_MAX); ++ return -EINVAL; ++ } ++ ++ if (sb->dev_idx >= sb->nr_devices) { ++ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", ++ sb->dev_idx, sb->nr_devices); ++ return -EINVAL; ++ } ++ ++ if (!sb->time_precision || ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { ++ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", ++ le32_to_cpu(sb->time_precision), NSEC_PER_SEC); ++ return -EINVAL; ++ } ++ ++ if (rw == READ) { ++ /* ++ * Been seeing a bug where these are getting inexplicably ++ * zeroed, so we're now validating them, but we have to be ++ * careful not to prevent people's filesystems from mounting: ++ */ ++ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); ++ } ++ ++ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { ++ const struct bch_option *opt = bch2_opt_table + opt_id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, opt_id); ++ ++ prt_printf(out, "Invalid option "); ++ ret = bch2_opt_validate(opt, v, out); ++ if (ret) ++ return ret; ++ ++ printbuf_reset(out); ++ } ++ } ++ ++ /* validate layout */ ++ ret = validate_sb_layout(&sb->layout, out); ++ if (ret) ++ return ret; ++ ++ vstruct_for_each(sb, f) { ++ if (!f->u64s) { ++ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } ++ ++ if (vstruct_next(f) > vstruct_last(sb)) { ++ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } ++ } ++ ++ /* members must be validated first: */ ++ mi = bch2_sb_get_members(sb); ++ if (!mi) { ++ prt_printf(out, "Invalid superblock: member info area missing"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_sb_field_validate(sb, &mi->field, out); ++ if (ret) ++ return ret; ++ ++ vstruct_for_each(sb, f) { ++ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) ++ continue; ++ ++ ret = bch2_sb_field_validate(sb, f, out); ++ if (ret) ++ return ret;
++ } ++ ++ return 0; ++} ++ ++/* device open: */ ++ ++static void bch2_sb_update(struct bch_fs *c) ++{ ++ struct bch_sb *src = c->disk_sb.sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(src); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->sb.uuid = src->uuid; ++ c->sb.user_uuid = src->user_uuid; ++ c->sb.version = le16_to_cpu(src->version); ++ c->sb.version_min = le16_to_cpu(src->version_min); ++ c->sb.nr_devices = src->nr_devices; ++ c->sb.clean = BCH_SB_CLEAN(src); ++ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); ++ ++ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); ++ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; ++ ++ /* XXX this is wrong, we need a 96 or 128 bit integer type */ ++ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), ++ c->sb.nsec_per_time_unit); ++ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); ++ ++ c->sb.features = le64_to_cpu(src->features[0]); ++ c->sb.compat = le64_to_cpu(src->compat[0]); ++ ++ for_each_member_device(ca, c, i) ++ ca->mi = bch2_mi_to_cpu(mi->members + i); ++} ++ ++static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) ++{ ++ struct bch_sb_field *src_f, *dst_f; ++ struct bch_sb *dst = dst_handle->sb; ++ unsigned i; ++ ++ dst->version = src->version; ++ dst->version_min = src->version_min; ++ dst->seq = src->seq; ++ dst->uuid = src->uuid; ++ dst->user_uuid = src->user_uuid; ++ memcpy(dst->label, src->label, sizeof(dst->label)); ++ ++ dst->block_size = src->block_size; ++ dst->nr_devices = src->nr_devices; ++ ++ dst->time_base_lo = src->time_base_lo; ++ dst->time_base_hi = src->time_base_hi; ++ dst->time_precision = src->time_precision; ++ ++ memcpy(dst->flags, src->flags, sizeof(dst->flags)); ++ memcpy(dst->features, src->features, sizeof(dst->features)); ++ memcpy(dst->compat, src->compat, sizeof(dst->compat)); ++ ++ for (i = 0; i < BCH_SB_FIELD_NR; i++) { ++ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) ++ continue; ++ ++ src_f = bch2_sb_field_get(src, i); ++ dst_f = bch2_sb_field_get(dst, i); ++ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, ++ src_f ? le32_to_cpu(src_f->u64s) : 0); ++ ++ if (src_f) ++ memcpy(dst_f, src_f, vstruct_bytes(src_f)); ++ } ++} ++ ++int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) ++{ ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(src); ++ unsigned journal_u64s = journal_buckets ++ ? le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ int ret; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ ret = bch2_sb_realloc(&c->disk_sb, ++ le32_to_cpu(src->u64s) - journal_u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&c->disk_sb, src); ++ ++ ret = bch2_sb_replicas_to_cpu_replicas(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ return ret; ++ ++ bch2_sb_update(c); ++ return 0; ++} ++ ++int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(dst); ++ unsigned journal_u64s = journal_buckets ++ ? 
le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; ++ int ret; ++ ++ ret = bch2_sb_realloc(&ca->disk_sb, u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&ca->disk_sb, src); ++ return 0; ++} ++ ++/* read superblock: */ ++ ++static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) ++{ ++ struct bch_csum csum; ++ u32 version, version_min; ++ size_t bytes; ++ int ret; ++reread: ++ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); ++ sb->bio->bi_iter.bi_sector = offset; ++ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); ++ ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ prt_printf(err, "IO error: %i", ret); ++ return ret; ++ } ++ ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { ++ prt_printf(err, "Not a bcachefs superblock"); ++ return -EINVAL; ++ } ++ ++ version = le16_to_cpu(sb->sb->version); ++ version_min = version >= bcachefs_metadata_version_bkey_renumber ++ ? le16_to_cpu(sb->sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ bytes = vstruct_bytes(sb->sb); ++ ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { ++ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", ++ bytes, 512UL << sb->sb->layout.sb_max_size_bits); ++ return -EINVAL; ++ } ++ ++ if (bytes > sb->buffer_size) { ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) ++ return -ENOMEM; ++ goto reread; ++ } ++ ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { ++ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); ++ return -EINVAL; ++ } ++ ++ /* XXX: verify MACs */ ++ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), ++ null_nonce(), sb->sb); ++ ++ if (bch2_crc_cmp(csum, sb->sb->csum)) { ++ prt_printf(err, "bad checksum"); ++ return -EINVAL; ++ } ++ ++ sb->seq = le64_to_cpu(sb->sb->seq); ++ ++ return 0; ++} ++ ++int bch2_read_super(const char *path, struct bch_opts *opts, ++ struct bch_sb_handle *sb) ++{ ++ u64 offset = opt_get(*opts, sb); ++ struct bch_sb_layout layout; ++ struct printbuf err = PRINTBUF; ++ __le64 *i; ++ int ret; ++ ++ pr_verbose_init(*opts, ""); ++ ++ memset(sb, 0, sizeof(*sb)); ++ sb->mode = FMODE_READ; ++ sb->have_bio = true; ++ ++ if (!opt_get(*opts, noexcl)) ++ sb->mode |= FMODE_EXCL; ++ ++ if (!opt_get(*opts, nochanges)) ++ sb->mode |= FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (IS_ERR(sb->bdev) && ++ PTR_ERR(sb->bdev) == -EACCES && ++ opt_get(*opts, read_only)) { ++ sb->mode &= ~FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (!IS_ERR(sb->bdev)) ++ opt_set(*opts, nochanges, true); ++ } ++ ++ if (IS_ERR(sb->bdev)) { ++ ret = PTR_ERR(sb->bdev); ++ goto out; ++ } ++ ++ ret = bch2_sb_realloc(sb, 0); ++ if (ret) { ++ prt_printf(&err, "error allocating memory for superblock"); ++ goto err; ++ } ++ ++ if (bch2_fs_init_fault("read_super")) { ++ prt_printf(&err, "dynamic fault"); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) ++ goto got_super; ++ ++ if (opt_defined(*opts, sb)) ++ goto err; ++ ++ 
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", ++ path, err.buf); ++ printbuf_reset(&err); ++ ++ /* ++ * Error reading primary superblock - read location of backup ++ * superblocks: ++ */ ++ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); ++ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; ++ /* ++ * use sb buffer to read layout, since sb buffer is page aligned but ++ * layout won't be: ++ */ ++ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); ++ ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ prt_printf(&err, "IO error: %i", ret); ++ goto err; ++ } ++ ++ memcpy(&layout, sb->sb, sizeof(layout)); ++ ret = validate_sb_layout(&layout, &err); ++ if (ret) ++ goto err; ++ ++ for (i = layout.sb_offset; ++ i < layout.sb_offset + layout.nr_superblocks; i++) { ++ offset = le64_to_cpu(*i); ++ ++ if (offset == opt_get(*opts, sb)) ++ continue; ++ ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) ++ goto got_super; ++ } ++ ++ goto err; ++ ++got_super: ++ if (le16_to_cpu(sb->sb->block_size) << 9 < ++ bdev_logical_block_size(sb->bdev)) { ++ prt_printf(&err, "block size (%u) smaller than device block size (%u)", ++ le16_to_cpu(sb->sb->block_size) << 9, ++ bdev_logical_block_size(sb->bdev)); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = 0; ++ sb->have_layout = true; ++ ++ ret = bch2_sb_validate(sb, &err, READ); ++ if (ret) { ++ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", ++ path, err.buf); ++ goto err_no_print; ++ } ++out: ++ pr_verbose_init(*opts, "ret %i", ret); ++ printbuf_exit(&err); ++ return ret; ++err: ++ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", ++ path, err.buf); ++err_no_print: ++ bch2_free_super(sb); ++ goto out; ++} ++ ++/* write superblock: */ ++ ++static void write_super_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ ++ /* XXX: return errors directly */ ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ ca->sb_write_error = 1; ++ ++ closure_put(&ca->fs->sb_write); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void read_back_super(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ sb->offset = sb->layout.sb_offset[idx]; ++ ++ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); ++ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), ++ null_nonce(), sb); ++ ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bch2_bio_map(bio, sb, ++ roundup((size_t) vstruct_bytes(sb), ++ bdev_logical_block_size(ca->disk_sb.bdev))); ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++int 
bch2_write_super(struct bch_fs *c) ++{ ++ struct closure *cl = &c->sb_write; ++ struct bch_dev *ca; ++ struct printbuf err = PRINTBUF; ++ unsigned i, sb = 0, nr_wrote; ++ struct bch_devs_mask sb_written; ++ bool wrote, can_mount_without_written, can_mount_with_written; ++ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; ++ int ret = 0; ++ ++ trace_write_super(c, _RET_IP_); ++ ++ if (c->opts.very_degraded) ++ degraded_flags |= BCH_FORCE_IF_LOST; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ closure_init_stack(cl); ++ memset(&sb_written, 0, sizeof(sb_written)); ++ ++ le64_add_cpu(&c->disk_sb.sb->seq, 1); ++ ++ if (test_bit(BCH_FS_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); ++ ++ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); ++ ++ bch2_sb_counters_from_cpu(c); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ for_each_online_member(ca, c, i) { ++ printbuf_reset(&err); ++ ++ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); ++ if (ret) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); ++ percpu_ref_put(&ca->io_ref); ++ goto out; ++ } ++ } ++ ++ if (c->opts.nochanges) ++ goto out; ++ ++ /* ++ * Defer writing the superblock until filesystem initialization is ++ * complete - don't write out a partly initialized superblock: ++ */ ++ if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) ++ goto out; ++ ++ for_each_online_member(ca, c, i) { ++ __set_bit(ca->dev_idx, sb_written.d); ++ ca->sb_write_error = 0; ++ } ++ ++ for_each_online_member(ca, c, i) ++ read_back_super(c, ca); ++ closure_sync(cl); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ continue; ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock write was silently dropped! 
(seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process (seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ } ++ ++ do { ++ wrote = false; ++ for_each_online_member(ca, c, i) ++ if (!ca->sb_write_error && ++ sb < ca->disk_sb.sb->layout.nr_superblocks) { ++ write_one_super(c, ca, sb); ++ wrote = true; ++ } ++ closure_sync(cl); ++ sb++; ++ } while (wrote); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ __clear_bit(ca->dev_idx, sb_written.d); ++ else ++ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); ++ } ++ ++ nr_wrote = dev_mask_nr(&sb_written); ++ ++ can_mount_with_written = ++ bch2_have_enough_devs(c, sb_written, degraded_flags, false); ++ ++ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) ++ sb_written.d[i] = ~sb_written.d[i]; ++ ++ can_mount_without_written = ++ bch2_have_enough_devs(c, sb_written, degraded_flags, false); ++ ++ /* ++ * If we would be able to mount _without_ the devices we successfully ++ * wrote superblocks to, we weren't able to write to enough devices: ++ * ++ * Exception: if we can mount without the successes because we haven't ++ * written anything (new filesystem), we continue if we'd be able to ++ * mount with the devices we did successfully write to: ++ */ ++ if (bch2_fs_fatal_err_on(!nr_wrote || ++ !can_mount_with_written || ++ (can_mount_without_written && ++ !can_mount_with_written), c, ++ "Unable to write superblock to sufficient devices (from %ps)", ++ (void *) _RET_IP_)) ++ ret = -1; ++out: ++ /* Make new options visible after they're persistent: */ ++ bch2_sb_update(c); ++ printbuf_exit(&err); ++ return ret; ++} ++ ++void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static int bch2_sb_members_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ unsigned i; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) { ++ prt_printf(err, "too many devices for section size"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -EINVAL; ++ } ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -EINVAL; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) { ++ prt_printf(err, "device %u: bucket size %u smaller than block size %u", ++ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -EINVAL; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) { ++ prt_printf(err, "device %u: bucket size %u smaller than btree node 
size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ prt_printf(out, "Device:"); ++ prt_tab(out); ++ prt_printf(out, "%u", i); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "UUID:"); ++ prt_tab(out); ++ pr_uuid(out, m->uuid.b); ++ prt_newline(out); ++ ++ prt_printf(out, "Size:"); ++ prt_tab(out); ++ prt_units_u64(out, device_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "Bucket size:"); ++ prt_tab(out); ++ prt_units_u64(out, bucket_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "First bucket:"); ++ prt_tab(out); ++ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); ++ prt_newline(out); ++ ++ prt_printf(out, "Buckets:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); ++ prt_newline(out); ++ ++ prt_printf(out, "Last mount:"); ++ prt_tab(out); ++ if (m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ prt_printf(out, "(never)"); ++ prt_newline(out); ++ ++ prt_printf(out, "State:"); ++ prt_tab(out); ++ prt_printf(out, "%s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ prt_newline(out); ++ ++ prt_printf(out, "Label:"); ++ prt_tab(out); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ prt_printf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ prt_printf(out, "(bad disk labels section)"); ++ } else { ++ prt_printf(out, "(none)"); ++ } ++ prt_newline(out); ++ ++ prt_printf(out, "Data allowed:"); ++ prt_tab(out); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Has data:"); ++ prt_tab(out); ++ if (data_have) ++ prt_bitflags(out, bch2_data_types, data_have); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Discard:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); ++ prt_newline(out); ++ ++ prt_printf(out, "Freespace initialized:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++ } ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++static int bch2_sb_crypt_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -EINVAL; ++ } ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -EINVAL; ++ } ++ ++ 
return 0; ++} ++ ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ prt_printf(out, "KDF: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ prt_newline(out); ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, ++}; ++ ++/* BCH_SB_FIELD_clean: */ ++ ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = bch2_journal_entry_validate(c, "superblock", entry, ++ le16_to_cpu(c->disk_sb.sb->version), ++ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), ++ write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ++ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) ++{ ++ struct jset_entry *entry = *end; ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ ++ memset(entry, 0, u64s * sizeof(u64)); ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields.
++ */ ++ entry->u64s = cpu_to_le16(u64s - 1); ++ ++ *end = vstruct_next(*end); ++ return entry; ++} ++ ++void bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry **end, ++ u64 journal_seq) ++{ ++ struct bch_dev *ca; ++ unsigned i, dev; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ if (!journal_seq) { ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), ++ struct jset_entry_data_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ memcpy(&u->r, e, replicas_entry_bytes(e)); ++ } ++ ++ for_each_member_device(ca, c, dev) { ++ unsigned b = sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; ++ struct jset_entry_dev_usage *u = ++ container_of(jset_entry_init(end, b), ++ struct jset_entry_dev_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_dev_usage; ++ u->dev = cpu_to_le32(dev); ++ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); ++ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); ++ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); ++ } ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < 2; i++) { ++ struct jset_entry_clock *clock = ++ container_of(jset_entry_init(end, sizeof(*clock)), ++ struct jset_entry_clock, entry); ++ ++ clock->entry.type = BCH_JSET_ENTRY_clock; ++ clock->rw = i; ++ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); ++ } ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ 
if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ bch2_journal_super_entries_add_common(c, &entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ /* ++ * this should be in the write path, and we should be validating every ++ * superblock section: ++ */ ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); ++ if (ret) { ++ bch_err(c, "error writing marking filesystem clean: validate error"); ++ goto out; ++ } ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} ++ ++static int bch2_sb_clean_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ prt_newline(out); ++ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ prt_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) ++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ prt_newline(out); ++ } ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, ++}; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { ++#define x(f, nr) \ ++ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, ++ BCH_SB_FIELDS() ++#undef x ++}; ++ ++static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ struct printbuf field_err = PRINTBUF; ++ int ret; ++ ++ if (type >= BCH_SB_FIELD_NR) ++ return 0; ++ ++ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); ++ if (ret) { ++ prt_printf(err, "Invalid superblock section %s: %s", ++ bch2_sb_fields[type], ++ field_err.buf); ++ prt_newline(err); ++ bch2_sb_field_to_text(err, sb, f); ++ } ++ ++ printbuf_exit(&field_err); ++ return ret; ++} ++ ++void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ++ ? 
bch2_sb_field_ops[type] : NULL; ++ ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ ++ if (ops) ++ prt_printf(out, "%s", bch2_sb_fields[type]); ++ else ++ prt_printf(out, "(unknown field %u)", type); ++ ++ prt_printf(out, " (size %zu):", vstruct_bytes(f)); ++ prt_newline(out); ++ ++ if (ops && ops->to_text) { ++ printbuf_indent_add(out, 2); ++ bch2_sb_field_ops[type]->to_text(out, sb, f); ++ printbuf_indent_sub(out, 2); ++ } ++} ++ ++void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) ++{ ++ unsigned i; ++ ++ prt_printf(out, "Type: %u", l->layout_type); ++ prt_newline(out); ++ ++ prt_str(out, "Superblock max size: "); ++ prt_units_u64(out, 512 << l->sb_max_size_bits); ++ prt_newline(out); ++ ++ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); ++ prt_newline(out); ++ ++ prt_str(out, "Offsets: "); ++ for (i = 0; i < l->nr_superblocks; i++) { ++ if (i) ++ prt_str(out, ", "); ++ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); ++ } ++ prt_newline(out); ++} ++ ++void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, ++ bool print_layout, unsigned fields) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field *f; ++ u64 fields_have = 0; ++ unsigned nr_devices = 0; ++ ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ ++ mi = bch2_sb_get_members(sb); ++ if (mi) { ++ struct bch_member *m; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) ++ nr_devices += bch2_member_exists(m); ++ } ++ ++ prt_printf(out, "External UUID:"); ++ prt_tab(out); ++ pr_uuid(out, sb->user_uuid.b); ++ prt_newline(out); ++ ++ prt_printf(out, "Internal UUID:"); ++ prt_tab(out); ++ pr_uuid(out, sb->uuid.b); ++ prt_newline(out); ++ ++ prt_str(out, "Device index:"); ++ prt_tab(out); ++ prt_printf(out, "%u", sb->dev_idx); ++ prt_newline(out); ++ ++ prt_str(out, "Label:"); ++ prt_tab(out); ++ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); ++ prt_newline(out); ++ ++ prt_str(out, "Version:"); ++ prt_tab(out); ++ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); ++ prt_newline(out); ++ ++ prt_printf(out, "Oldest version on disk:"); ++ prt_tab(out); ++ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); ++ prt_newline(out); ++ ++ prt_printf(out, "Created:"); ++ prt_tab(out); ++ if (sb->time_base_lo) ++ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); ++ else ++ prt_printf(out, "(not set)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Sequence number:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(sb->seq)); ++ prt_newline(out); ++ ++ prt_printf(out, "Superblock size:"); ++ prt_tab(out); ++ prt_printf(out, "%zu", vstruct_bytes(sb)); ++ prt_newline(out); ++ ++ prt_printf(out, "Clean:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); ++ prt_newline(out); ++ ++ prt_printf(out, "Devices:"); ++ prt_tab(out); ++ prt_printf(out, "%u", nr_devices); ++ prt_newline(out); ++ ++ prt_printf(out, "Sections:"); ++ vstruct_for_each(sb, f) ++ fields_have |= 1 << le32_to_cpu(f->type); ++ prt_tab(out); ++ prt_bitflags(out, bch2_sb_fields, fields_have); ++ prt_newline(out); ++ ++ prt_printf(out, "Features:"); ++ prt_tab(out); ++ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); ++ prt_newline(out); ++ ++ prt_printf(out, "Compat features:"); ++ prt_tab(out); ++ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); ++ prt_newline(out); ++ ++ prt_newline(out); ++ prt_printf(out, "Options:"); ++ prt_newline(out); ++ printbuf_indent_add(out, 
2); ++ { ++ enum bch_opt_id id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, id); ++ ++ prt_printf(out, "%s:", opt->attr.name); ++ prt_tab(out); ++ bch2_opt_to_text(out, NULL, sb, opt, v, ++ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); ++ prt_newline(out); ++ } ++ } ++ } ++ ++ printbuf_indent_sub(out, 2); ++ ++ if (print_layout) { ++ prt_newline(out); ++ prt_printf(out, "layout:"); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ bch2_sb_layout_to_text(out, &sb->layout); ++ printbuf_indent_sub(out, 2); ++ } ++ ++ vstruct_for_each(sb, f) ++ if (fields & (1 << le32_to_cpu(f->type))) { ++ prt_newline(out); ++ bch2_sb_field_to_text(out, sb, f); ++ } ++} +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +new file mode 100644 +index 000000000000..14a25f6fe29a +--- /dev/null ++++ b/fs/bcachefs/super-io.h +@@ -0,0 +1,126 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_IO_H ++#define _BCACHEFS_SUPER_IO_H ++ ++#include "extents.h" ++#include "eytzinger.h" ++#include "super_types.h" ++#include "super.h" ++ ++#include ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, ++ enum bch_sb_field_type, unsigned); ++void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); ++ ++#define field_to_type(_f, _name) \ ++ container_of_or_null(_f, struct bch_sb_field_##_name, field) ++ ++#define x(_name, _nr) \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_get_##_name(struct bch_sb *sb) \ ++{ \ ++ return field_to_type(bch2_sb_field_get(sb, \ ++ BCH_SB_FIELD_##_name), _name); \ ++} \ ++ \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ ++{ \ ++ return field_to_type(bch2_sb_field_resize(sb, \ ++ BCH_SB_FIELD_##_name, u64s), _name); \ ++} ++ ++BCH_SB_FIELDS() ++#undef x ++ ++extern const char * const bch2_sb_fields[]; ++ ++struct bch_sb_field_ops { ++ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); ++}; ++ ++static inline __le64 bch2_sb_magic(struct bch_fs *c) ++{ ++ __le64 ret; ++ memcpy(&ret, &c->sb.uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 jset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); ++} ++ ++static inline __u64 bset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); ++} ++ ++int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); ++int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); ++ ++void bch2_free_super(struct bch_sb_handle *); ++int bch2_sb_realloc(struct bch_sb_handle *, unsigned); ++ ++int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); ++int bch2_write_super(struct bch_fs *); ++void __bch2_check_set_feature(struct bch_fs *, unsigned); ++ ++static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ if (!(c->sb.features & (1ULL << feat))) ++ __bch2_check_set_feature(c, feat); ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, ++ struct bch_sb_field_members *mi, ++ unsigned dev) ++{ ++ return dev < sb->nr_devices && ++ 
bch2_member_exists(&mi->members[dev]); ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), ++ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), ++ }; ++} ++ ++/* BCH_SB_FIELD_clean: */ ++ ++void bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry **, u64); ++ ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); ++void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); ++ ++#endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +new file mode 100644 +index 000000000000..7c6348001ae3 +--- /dev/null ++++ b/fs/bcachefs/super.c +@@ -0,0 +1,1950 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs setup/teardown code, and some metadata io - read a superblock and ++ * figure out what to do with it. ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets_waiting_for_journal.h" ++#include "chardev.h" ++#include "checksum.h" ++#include "clock.h" ++#include "compress.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "move.h" ++#include "migrate.h" ++#include "movinggc.h" ++#include "quota.h" ++#include "rebalance.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "subvolume.h" ++#include "super.h" ++#include "super-io.h" ++#include "sysfs.h" ++#include "counters.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Kent Overstreet "); ++ ++#define KTYPE(type) \ ++static const struct attribute_group type ## _group = { \ ++ .attrs = type ## _files \ ++}; \ ++ \ ++static const struct attribute_group *type ## _groups[] = { \ ++ &type ## _group, \ ++ NULL \ ++}; \ ++ \ ++static const struct kobj_type type ## _ktype = { \ ++ .release = type ## _release, \ ++ .sysfs_ops = &type ## _sysfs_ops, \ ++ .default_groups = type ## _groups \ ++} ++ ++static void bch2_fs_release(struct kobject *); ++static void bch2_dev_release(struct kobject *); ++static void bch2_fs_counters_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_internal_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_opts_dir_release(struct kobject 
*k) ++{ ++} ++ ++static void bch2_fs_time_stats_release(struct kobject *k) ++{ ++} ++ ++KTYPE(bch2_fs); ++KTYPE(bch2_fs_counters); ++KTYPE(bch2_fs_internal); ++KTYPE(bch2_fs_opts_dir); ++KTYPE(bch2_fs_time_stats); ++KTYPE(bch2_dev); ++ ++static struct kset *bcachefs_kset; ++static LIST_HEAD(bch_fs_list); ++static DEFINE_MUTEX(bch_fs_list_lock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); ++ ++static void bch2_dev_free(struct bch_dev *); ++static int bch2_dev_alloc(struct bch_fs *, unsigned); ++static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ++static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); ++ ++struct bch_fs *bch2_dev_to_fs(dev_t dev) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ rcu_read_lock(); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { ++ closure_get(&c->cl); ++ goto found; ++ } ++ c = NULL; ++found: ++ rcu_read_unlock(); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) ++ return c; ++ ++ return NULL; ++} ++ ++struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(uuid); ++ if (c) ++ closure_get(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static void bch2_dev_usage_journal_reserve(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, u64s = ++ ((sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / ++ sizeof(u64); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) ++ nr++; ++ rcu_read_unlock(); ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->dev_usage_journal_res, u64s * nr); ++} ++ ++/* Filesystem RO/RW: */ ++ ++/* ++ * For startup/shutdown of RW stuff, the dependencies are: ++ * ++ * - foreground writes depend on copygc and rebalance (to free up space) ++ * ++ * - copygc and rebalance depend on mark and sweep gc (they actually probably ++ * don't because they either reserve ahead of time or don't block if ++ * allocations fail, but allocations can require mark and sweep gc to run ++ * because of generation number wraparound) ++ * ++ * - all of the above depends on the allocator threads ++ * ++ * - allocator depends on the journal (when it rewrites prios and gens) ++ */ ++ ++static void __bch2_fs_read_only(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i, clean_passes = 0; ++ u64 seq = 0; ++ ++ bch2_rebalance_stop(c); ++ bch2_copygc_stop(c); ++ bch2_gc_thread_stop(c); ++ ++ bch_verbose(c, "flushing journal and stopping allocators"); ++ ++ do { ++ clean_passes++; ++ ++ if (bch2_btree_interior_updates_flush(c) || ++ bch2_journal_flush_all_pins(&c->journal) || ++ bch2_btree_flush_all_writes(c) || ++ seq != atomic64_read(&c->journal.seq)) { ++ seq = atomic64_read(&c->journal.seq); ++ clean_passes = 0; ++ } ++ } while (clean_passes < 2); ++ ++ bch_verbose(c, "flushing journal and stopping allocators complete"); ++ ++ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); ++ bch2_fs_journal_stop(&c->journal); ++ ++ /* ++ * After stopping 
journal: ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_remove(c, ca); ++} ++ ++static void bch2_writes_disabled(struct percpu_ref *writes) ++{ ++ struct bch_fs *c = container_of(writes, struct bch_fs, writes); ++ ++ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ wake_up(&bch_read_only_wait); ++} ++ ++void bch2_fs_read_only(struct bch_fs *c) ++{ ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ bch2_journal_reclaim_stop(&c->journal); ++ return; ++ } ++ ++ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ /* ++ * Block new foreground-end write operations from starting - any new ++ * writes will return -EROFS: ++ */ ++ percpu_ref_kill(&c->writes); ++ ++ cancel_work_sync(&c->ec_stripe_delete_work); ++ ++ /* ++ * If we're not doing an emergency shutdown, we want to wait on ++ * outstanding writes to complete so they don't see spurious errors due ++ * to shutting down the allocator: ++ * ++ * If we are doing an emergency shutdown outstanding writes may ++ * hang until we shutdown the allocator so we don't want to wait ++ * on outstanding writes before shutting everything down - but ++ * we do need to wait on them before returning and signalling ++ * that going RO is complete: ++ */ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || ++ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); ++ ++ __bch2_fs_read_only(c); ++ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ ++ if (!bch2_journal_error(&c->journal) && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && ++ test_bit(BCH_FS_STARTED, &c->flags) && ++ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && ++ !c->opts.norecovery) { ++ bch_verbose(c, "marking filesystem clean"); ++ bch2_fs_mark_clean(c); ++ } ++ ++ clear_bit(BCH_FS_RW, &c->flags); ++} ++ ++static void bch2_fs_read_only_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, read_only_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++} ++ ++static void bch2_fs_read_only_async(struct bch_fs *c) ++{ ++ queue_work(system_long_wq, &c->read_only_work); ++} ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *c) ++{ ++ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); ++ ++ bch2_journal_halt(&c->journal); ++ bch2_fs_read_only_async(c); ++ ++ wake_up(&bch_read_only_wait); ++ return ret; ++} ++ ++static int bch2_fs_read_write_late(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ ++ ret = bch2_rebalance_start(c); ++ if (ret) { ++ bch_err(c, "error starting rebalance thread"); ++ return ret; ++ } ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ ++ return 0; ++} ++ ++static int __bch2_fs_read_write(struct bch_fs *c, bool early) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { ++ bch_err(c, "cannot go rw, unfixed btree errors"); ++ return -EROFS; ++ } ++ ++ if (test_bit(BCH_FS_RW, &c->flags)) ++ return 0; ++ ++ /* ++ * nochanges is used for fsck -n mode - we have to allow going rw ++ * during recovery for that to work: ++ */ ++ if (c->opts.norecovery || ++ (c->opts.nochanges && ++ (!early || c->opts.read_only))) ++ return 
-EROFS; ++ ++ bch_info(c, "going read-write"); ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ goto err; ++ ++ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ bch2_do_discards(c); ++ bch2_do_invalidates(c); ++ ++ if (!early) { ++ ret = bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ percpu_ref_reinit(&c->writes); ++ set_bit(BCH_FS_RW, &c->flags); ++ set_bit(BCH_FS_WAS_RW, &c->flags); ++ return 0; ++err: ++ __bch2_fs_read_only(c); ++ return ret; ++} ++ ++int bch2_fs_read_write(struct bch_fs *c) ++{ ++ return __bch2_fs_read_write(c, false); ++} ++ ++int bch2_fs_read_write_early(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ return __bch2_fs_read_write(c, true); ++} ++ ++/* Filesystem startup/shutdown: */ ++ ++static void __bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ int cpu; ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_exit(&c->times[i]); ++ ++ bch2_fs_counters_exit(c); ++ bch2_fs_snapshots_exit(c); ++ bch2_fs_quota_exit(c); ++ bch2_fs_fsio_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_io_exit(c); ++ bch2_fs_buckets_waiting_for_journal_exit(c); ++ bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_replicas_exit(c); ++ bch2_fs_journal_exit(&c->journal); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); ++ bch2_fs_compress_exit(c); ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(c); ++ percpu_free_rwsem(&c->mark_lock); ++ ++ if (c->btree_paths_bufs) ++ for_each_possible_cpu(cpu) ++ kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); ++ ++ free_percpu(c->online_reserved); ++ free_percpu(c->btree_paths_bufs); ++ free_percpu(c->pcpu); ++ mempool_exit(&c->large_bkey_pool); ++ mempool_exit(&c->btree_bounce_pool); ++ bioset_exit(&c->btree_bio); ++ mempool_exit(&c->fill_iter); ++ percpu_ref_exit(&c->writes); ++ kfree(rcu_dereference_protected(c->disk_groups, 1)); ++ kfree(c->journal_seq_blacklist_table); ++ kfree(c->unused_inode_hints); ++ free_heap(&c->copygc_heap); ++ ++ if (c->io_complete_wq ) ++ destroy_workqueue(c->io_complete_wq ); ++ if (c->copygc_wq) ++ destroy_workqueue(c->copygc_wq); ++ if (c->btree_io_complete_wq) ++ destroy_workqueue(c->btree_io_complete_wq); ++ if (c->btree_update_wq) ++ destroy_workqueue(c->btree_update_wq); ++ ++ bch2_free_super(&c->disk_sb); ++ kvpfree(c, sizeof(*c)); ++ module_put(THIS_MODULE); ++} ++ ++static void bch2_fs_release(struct kobject *kobj) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ __bch2_fs_free(c); ++} ++ ++void __bch2_fs_stop(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ bch_verbose(c, "shutting down"); ++ ++ set_bit(BCH_FS_STOPPING, &c->flags); ++ ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ ++ if (c->kobj.state_in_sysfs) ++ kobject_del(&c->kobj); ++ ++ bch2_fs_debug_exit(c); ++ bch2_fs_chardev_exit(c); ++ ++ kobject_put(&c->counters_kobj); ++ kobject_put(&c->time_stats); ++ kobject_put(&c->opts_dir); ++ kobject_put(&c->internal); ++ ++ /* btree prefetch might have kicked off 
reads in the background: */ ++ bch2_btree_flush_all_reads(c); ++ ++ for_each_member_device(ca, c, i) ++ cancel_work_sync(&ca->io_error_work); ++ ++ cancel_work_sync(&c->read_only_work); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_free_super(&c->devs[i]->disk_sb); ++} ++ ++void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ ++ bch_verbose(c, "shutdown complete"); ++ ++ kobject_put(&c->kobj); ++} ++ ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ __bch2_fs_stop(c); ++ bch2_fs_free(c); ++} ++ ++static int bch2_fs_online(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ if (__bch2_uuid_to_fs(c->sb.uuid)) { ++ bch_err(c, "filesystem UUID already open"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_fs_chardev_init(c); ++ if (ret) { ++ bch_err(c, "error creating character device"); ++ return ret; ++ } ++ ++ bch2_fs_debug_init(c); ++ ++ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: ++ kobject_add(&c->internal, &c->kobj, "internal") ?: ++ kobject_add(&c->opts_dir, &c->kobj, "options") ?: ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: ++ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: ++ bch2_opts_create_sysfs_files(&c->opts_dir); ++ if (ret) { ++ bch_err(c, "error creating sysfs objects"); ++ return ret; ++ } ++ ++ down_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) { ++ ret = bch2_dev_sysfs_online(c, ca); ++ if (ret) { ++ bch_err(c, "error creating sysfs objects"); ++ percpu_ref_put(&ca->ref); ++ goto err; ++ } ++ } ++ ++ BUG_ON(!list_empty(&c->list)); ++ list_add(&c->list, &bch_fs_list); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_fs *c; ++ struct printbuf name = PRINTBUF; ++ unsigned i, iter_size; ++ int ret = 0; ++ ++ pr_verbose_init(opts, ""); ++ ++ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); ++ if (!c) { ++ c = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ __module_get(THIS_MODULE); ++ ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); ++ ++ c->minor = -1; ++ c->disk_sb.fs_sb = true; ++ ++ init_rwsem(&c->state_lock); ++ mutex_init(&c->sb_lock); ++ mutex_init(&c->replicas_gc_lock); ++ mutex_init(&c->btree_root_lock); ++ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); ++ ++ init_rwsem(&c->gc_lock); ++ mutex_init(&c->gc_gens_lock); ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_init(&c->times[i]); ++ ++ bch2_fs_copygc_init(c); ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); ++ bch2_fs_allocator_background_init(c); ++ bch2_fs_allocator_foreground_init(c); ++ bch2_fs_rebalance_init(c); ++ bch2_fs_quota_init(c); ++ bch2_fs_ec_init_early(c); ++ ++ INIT_LIST_HEAD(&c->list); ++ ++ mutex_init(&c->usage_scratch_lock); ++ ++ mutex_init(&c->bio_bounce_pages_lock); ++ 
mutex_init(&c->snapshot_table_lock); ++ ++ spin_lock_init(&c->btree_write_error_lock); ++ ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ ++ INIT_LIST_HEAD(&c->journal_iters); ++ ++ INIT_LIST_HEAD(&c->fsck_errors); ++ mutex_init(&c->fsck_error_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_head_list); ++ mutex_init(&c->ec_stripe_head_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_new_list); ++ mutex_init(&c->ec_stripe_new_lock); ++ ++ INIT_LIST_HEAD(&c->data_progress_list); ++ mutex_init(&c->data_progress_lock); ++ ++ spin_lock_init(&c->ec_stripes_heap_lock); ++ ++ seqcount_init(&c->gc_pos_lock); ++ ++ seqcount_init(&c->usage_lock); ++ ++ sema_init(&c->io_in_flight, 64); ++ ++ c->copy_gc_enabled = 1; ++ c->rebalance.enabled = 1; ++ c->promote_whole_extents = true; ++ ++ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; ++ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ ++ mutex_init(&c->sectors_available_lock); ++ ++ ret = percpu_init_rwsem(&c->mark_lock); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_to_fs(c, sb); ++ mutex_unlock(&c->sb_lock); ++ ++ if (ret) ++ goto err; ++ ++ pr_uuid(&name, c->sb.user_uuid.b); ++ strlcpy(c->name, name.buf, sizeof(c->name)); ++ printbuf_exit(&name); ++ ++ ret = name.allocation_failure ? -ENOMEM : 0; ++ if (ret) ++ goto err; ++ ++ /* Compat: */ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); ++ ++ c->opts = bch2_opts_default; ++ ret = bch2_opts_from_sb(&c->opts, sb); ++ if (ret) ++ goto err; ++ ++ bch2_opts_apply(&c->opts, opts); ++ ++ /* key cache currently disabled for inodes, because of snapshots: */ ++ c->opts.inodes_use_key_cache = 0; ++ ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; ++ if (c->opts.inodes_use_key_cache) ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; ++ ++ c->block_bits = ilog2(block_sectors(c)); ++ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); ++ ++ if (bch2_fs_init_fault("fs_alloc")) { ++ bch_err(c, "fs_alloc fault injected"); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ iter_size = sizeof(struct sort_iter) + ++ (btree_blocks(c) + 1) * 2 * ++ sizeof(struct sort_iter_set); ++ ++ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); ++ ++ if (!(c->btree_update_wq = alloc_workqueue("bcachefs", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->io_complete_wq = alloc_workqueue("bcachefs_io", ++ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || ++ percpu_ref_init(&c->writes, bch2_writes_disabled, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || ++ bioset_init(&c->btree_bio, 1, ++ max(offsetof(struct btree_read_bio, bio), ++ offsetof(struct btree_write_bio, wbio.bio)), ++ BIOSET_NEED_BVECS) || ++ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ !(c->btree_paths_bufs 
= alloc_percpu(struct btree_path_buf)) || ++ !(c->online_reserved = alloc_percpu(u64)) || ++ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, ++ btree_bytes(c)) || ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, ++ sizeof(u64), GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_io_clock_init(&c->io_clock[READ]) ?: ++ bch2_io_clock_init(&c->io_clock[WRITE]) ?: ++ bch2_fs_journal_init(&c->journal) ?: ++ bch2_fs_replicas_init(c) ?: ++ bch2_fs_btree_cache_init(c) ?: ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: ++ bch2_fs_btree_iter_init(c) ?: ++ bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_buckets_waiting_for_journal_init(c) ?: ++ bch2_fs_subvolumes_init(c) ?: ++ bch2_fs_io_init(c) ?: ++ bch2_fs_encryption_init(c) ?: ++ bch2_fs_compress_init(c) ?: ++ bch2_fs_ec_init(c) ?: ++ bch2_fs_fsio_init(c) ?: ++ bch2_fs_counters_init(c); ++ if (ret) ++ goto err; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && ++ bch2_dev_alloc(c, i)) { ++ ret = -EEXIST; ++ goto err; ++ } ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->btree_root_journal_res, ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); ++ bch2_dev_usage_journal_reserve(c); ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->clock_journal_res, ++ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); ++ ++ mutex_lock(&bch_fs_list_lock); ++ ret = bch2_fs_online(c); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ if (ret) ++ goto err; ++out: ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err: ++ bch2_fs_free(c); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++noinline_for_stack ++static void print_mount_opts(struct bch_fs *c) ++{ ++ enum bch_opt_id i; ++ struct printbuf p = PRINTBUF; ++ bool first = true; ++ ++ if (c->opts.read_only) { ++ prt_printf(&p, "ro"); ++ first = false; ++ } ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->flags & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ if (!first) ++ prt_printf(&p, ","); ++ first = false; ++ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); ++ } ++ ++ if (!p.pos) ++ prt_printf(&p, "(null)"); ++ ++ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); ++ printbuf_exit(&p); ++} ++ ++int bch2_fs_start(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ time64_t now = ktime_get_real_seconds(); ++ unsigned i; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for_each_online_member(ca, c, i) ++ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ++ ? 
bch2_fs_recovery(c) ++ : bch2_fs_initialize(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_opts_check_may_set(c); ++ if (ret) ++ goto err; ++ ++ ret = -EINVAL; ++ if (bch2_fs_init_fault("fs_start")) { ++ bch_err(c, "fs_start fault injected"); ++ goto err; ++ } ++ ++ set_bit(BCH_FS_STARTED, &c->flags); ++ ++ if (c->opts.read_only || c->opts.nochanges) { ++ bch2_fs_read_only(c); ++ } else { ++ ret = !test_bit(BCH_FS_RW, &c->flags) ++ ? bch2_fs_read_write(c) ++ : bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ print_mount_opts(c); ++ ret = 0; ++out: ++ up_write(&c->state_lock); ++ return ret; ++err: ++ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); ++ ++ if (ret < -BCH_ERR_START) ++ ret = -EINVAL; ++ goto out; ++} ++ ++static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) ++{ ++ struct bch_sb_field_members *sb_mi; ++ ++ sb_mi = bch2_sb_get_members(sb); ++ if (!sb_mi) ++ return "Invalid superblock: member info area missing"; ++ ++ if (le16_to_cpu(sb->block_size) != block_sectors(c)) ++ return "mismatched block size"; ++ ++ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) ++ return "new cache bucket size is too small"; ++ ++ return NULL; ++} ++ ++static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) ++{ ++ struct bch_sb *newest = ++ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); ++ ++ if (uuid_le_cmp(fs->uuid, sb->uuid)) ++ return "device not a member of filesystem"; ++ ++ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) ++ return "device has been removed"; ++ ++ if (fs->block_size != sb->block_size) ++ return "mismatched block size"; ++ ++ return NULL; ++} ++ ++/* Device startup/shutdown: */ ++ ++static void bch2_dev_release(struct kobject *kobj) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ ++ kfree(ca); ++} ++ ++static void bch2_dev_free(struct bch_dev *ca) ++{ ++ cancel_work_sync(&ca->io_error_work); ++ ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ ++ if (ca->kobj.state_in_sysfs) ++ kobject_del(&ca->kobj); ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++ ++ free_percpu(ca->io_done); ++ bioset_exit(&ca->replica_set); ++ bch2_dev_buckets_free(ca); ++ free_page((unsigned long) ca->sb_read_scratch); ++ ++ bch2_time_stats_exit(&ca->io_latency[WRITE]); ++ bch2_time_stats_exit(&ca->io_latency[READ]); ++ ++ percpu_ref_exit(&ca->io_ref); ++ percpu_ref_exit(&ca->ref); ++ kobject_put(&ca->kobj); ++} ++ ++static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) ++{ ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (percpu_ref_is_zero(&ca->io_ref)) ++ return; ++ ++ __bch2_dev_read_only(c, ca); ++ ++ reinit_completion(&ca->io_ref_completion); ++ percpu_ref_kill(&ca->io_ref); ++ wait_for_completion(&ca->io_ref_completion); ++ ++ if (ca->kobj.state_in_sysfs) { ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ sysfs_remove_link(&ca->kobj, "block"); ++ } ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++} ++ ++static void bch2_dev_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); ++ ++ complete(&ca->ref_completion); ++} ++ ++static void bch2_dev_io_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); ++ ++ complete(&ca->io_ref_completion); 
++} ++ ++static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) ++{ ++ int ret; ++ ++ if (!c->kobj.state_in_sysfs) ++ return 0; ++ ++ if (!ca->kobj.state_in_sysfs) { ++ ret = kobject_add(&ca->kobj, &c->kobj, ++ "dev-%u", ca->dev_idx); ++ if (ret) ++ return ret; ++ } ++ ++ if (ca->disk_sb.bdev) { ++ struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ++ ++ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); ++ if (ret) ++ return ret; ++ ++ ret = sysfs_create_link(&ca->kobj, block, "block"); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ++ struct bch_member *member) ++{ ++ struct bch_dev *ca; ++ ++ ca = kzalloc(sizeof(*ca), GFP_KERNEL); ++ if (!ca) ++ return NULL; ++ ++ kobject_init(&ca->kobj, &bch2_dev_ktype); ++ init_completion(&ca->ref_completion); ++ init_completion(&ca->io_ref_completion); ++ ++ init_rwsem(&ca->bucket_lock); ++ ++ INIT_WORK(&ca->io_error_work, bch2_io_error_work); ++ ++ bch2_time_stats_init(&ca->io_latency[READ]); ++ bch2_time_stats_init(&ca->io_latency[WRITE]); ++ ++ ca->mi = bch2_mi_to_cpu(member); ++ ca->uuid = member->uuid; ++ ++ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / btree_sectors(c)); ++ ++ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, ++ 0, GFP_KERNEL) || ++ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ bch2_dev_buckets_alloc(c, ca) || ++ bioset_init(&ca->replica_set, 4, ++ offsetof(struct bch_write_bio, bio), 0) || ++ !(ca->io_done = alloc_percpu(*ca->io_done))) ++ goto err; ++ ++ return ca; ++err: ++ bch2_dev_free(ca); ++ return NULL; ++} ++ ++static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, ++ unsigned dev_idx) ++{ ++ ca->dev_idx = dev_idx; ++ __set_bit(ca->dev_idx, ca->self.d); ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ ca->fs = c; ++ rcu_assign_pointer(c->devs[ca->dev_idx], ca); ++ ++ if (bch2_dev_sysfs_online(c, ca)) ++ pr_warn("error creating sysfs objects"); ++} ++ ++static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ++{ ++ struct bch_member *member = ++ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; ++ struct bch_dev *ca = NULL; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bch2_fs_init_fault("dev_alloc")) ++ goto err; ++ ++ ca = __bch2_dev_alloc(c, member); ++ if (!ca) ++ goto err; ++ ++ ca->fs = c; ++ ++ bch2_dev_attach(c, ca, dev_idx); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ++{ ++ unsigned ret; ++ ++ if (bch2_dev_is_online(ca)) { ++ bch_err(ca, "already have device online in slot %u", ++ sb->sb->dev_idx); ++ return -EINVAL; ++ } ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "cannot online: device too small"); ++ return -EINVAL; ++ } ++ ++ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "device too small"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_dev_journal_init(ca, sb->sb); ++ if (ret) ++ return ret; ++ ++ /* Commit: */ ++ ca->disk_sb = *sb; ++ if (sb->mode & FMODE_EXCL) ++ ca->disk_sb.bdev->bd_holder = ca; ++ memset(sb, 0, sizeof(*sb)); ++ ++ ca->dev = ca->disk_sb.bdev->bd_dev; ++ ++ 
percpu_ref_reinit(&ca->io_ref); ++ ++ return 0; ++} ++ ++static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (le64_to_cpu(sb->sb->seq) > ++ le64_to_cpu(c->disk_sb.sb->seq)) ++ bch2_sb_to_fs(c, sb->sb); ++ ++ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || ++ !c->devs[sb->sb->dev_idx]); ++ ++ ca = bch_dev_locked(c, sb->sb->dev_idx); ++ ++ ret = __bch2_dev_attach_bdev(ca, sb); ++ if (ret) ++ return ret; ++ ++ bch2_dev_sysfs_online(c, ca); ++ ++ if (c->sb.nr_devices == 1) ++ bdevname(ca->disk_sb.bdev, c->name); ++ bdevname(ca->disk_sb.bdev, ca->name); ++ ++ rebalance_wakeup(c); ++ return 0; ++} ++ ++/* Device management: */ ++ ++/* ++ * Note: this function is also used by the error paths - when a particular ++ * device sees an error, we call it to determine whether we can just set the ++ * device RO, or - if this function returns false - we'll set the whole ++ * filesystem RO: ++ * ++ * XXX: maybe we should be more explicit about whether we're changing state ++ * because we got an error or what have you? ++ */ ++bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_devs_mask new_online_devs; ++ struct bch_dev *ca2; ++ int i, nr_rw = 0, required; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ switch (new_state) { ++ case BCH_MEMBER_STATE_rw: ++ return true; ++ case BCH_MEMBER_STATE_ro: ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) ++ return true; ++ ++ /* do we have enough devices to write to? */ ++ for_each_member_device(ca2, c, i) ++ if (ca2 != ca) ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; ++ ++ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ++ ? c->opts.metadata_replicas ++ : c->opts.metadata_replicas_required, ++ !(flags & BCH_FORCE_IF_DATA_DEGRADED) ++ ? c->opts.data_replicas ++ : c->opts.data_replicas_required); ++ ++ return nr_rw >= required; ++ case BCH_MEMBER_STATE_failed: ++ case BCH_MEMBER_STATE_spare: ++ if (ca->mi.state != BCH_MEMBER_STATE_rw && ++ ca->mi.state != BCH_MEMBER_STATE_ro) ++ return true; ++ ++ /* do we have enough devices to read from? 
*/ ++ new_online_devs = bch2_online_devs(c); ++ __clear_bit(ca->dev_idx, new_online_devs.d); ++ ++ return bch2_have_enough_devs(c, new_online_devs, flags, false); ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_fs_may_start(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned i, flags = 0; ++ ++ if (c->opts.very_degraded) ++ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; ++ ++ if (c->opts.degraded) ++ flags |= BCH_FORCE_IF_DEGRADED; ++ ++ if (!c->opts.degraded && ++ !c->opts.very_degraded) { ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) ++ continue; ++ ++ ca = bch_dev_locked(c, i); ++ ++ if (!bch2_dev_is_online(ca) && ++ (ca->mi.state == BCH_MEMBER_STATE_rw || ++ ca->mi.state == BCH_MEMBER_STATE_ro)) { ++ mutex_unlock(&c->sb_lock); ++ return false; ++ } ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); ++} ++ ++static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ++{ ++ /* ++ * Device going read only means the copygc reserve get smaller, so we ++ * don't want that happening while copygc is in progress: ++ */ ++ bch2_copygc_stop(c); ++ ++ /* ++ * The allocator thread itself allocates btree nodes, so stop it first: ++ */ ++ bch2_dev_allocator_remove(c, ca); ++ bch2_dev_journal_stop(&c->journal, ca); ++ ++ bch2_copygc_start(c); ++} ++ ++static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); ++ ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++} ++ ++int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ int ret = 0; ++ ++ if (ca->mi.state == new_state) ++ return 0; ++ ++ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) ++ return -EINVAL; ++ ++ if (new_state != BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_only(c, ca); ++ ++ bch_notice(ca, "%s", bch2_member_states[new_state]); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (new_state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ rebalance_wakeup(c); ++ ++ return ret; ++} ++ ++int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ int ret; ++ ++ down_write(&c->state_lock); ++ ret = __bch2_dev_set_state(c, ca, new_state, flags); ++ up_write(&c->state_lock); ++ ++ return ret; ++} ++ ++/* Device add/removal: */ ++ ++static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bpos start = POS(ca->dev_idx, 0); ++ struct bpos end = POS(ca->dev_idx, U64_MAX); ++ int ret; ++ ++ /* ++ * We clear the LRU and need_discard btrees first so that we don't race ++ * with bch2_do_invalidates() and bch2_do_discards() ++ */ ++ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, 
BTREE_ID_alloc, start, end, ++ BTREE_TRIGGER_NORUN, NULL); ++ if (ret) ++ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ unsigned dev_idx = ca->dev_idx, data; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ /* ++ * We consume a reference to ca->ref, regardless of whether we succeed ++ * or fail: ++ */ ++ percpu_ref_put(&ca->ref); ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { ++ bch_err(ca, "Cannot remove without losing data"); ++ goto err; ++ } ++ ++ __bch2_dev_read_only(c, ca); ++ ++ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); ++ if (ret) { ++ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_dev_remove_alloc(c, ca); ++ if (ret) { ++ bch_err(ca, "Remove failed, error deleting alloc info"); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); ++ if (ret) { ++ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush(&c->journal); ++ if (ret) { ++ bch_err(ca, "Remove failed, journal error"); ++ goto err; ++ } ++ ++ ret = bch2_replicas_gc2(c); ++ if (ret) { ++ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ struct printbuf data_has = PRINTBUF; ++ ++ prt_bitflags(&data_has, bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); ++ printbuf_exit(&data_has); ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); ++ mutex_unlock(&c->sb_lock); ++ ++ percpu_ref_kill(&ca->ref); ++ wait_for_completion(&ca->ref_completion); ++ ++ bch2_dev_free(ca); ++ ++ /* ++ * Free this device's slot in the bch_member array - all pointers to ++ * this device must be gone: ++ */ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); ++ ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++ ++ bch2_dev_usage_journal_reserve(c); ++ return 0; ++err: ++ if (ca->mi.state == BCH_MEMBER_STATE_rw && ++ !percpu_ref_is_zero(&ca->io_ref)) ++ __bch2_dev_read_write(c, ca); ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* Add new device to running filesystem: */ ++int bch2_dev_add(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb; ++ const char *err; ++ struct bch_dev *ca = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member dev_mi; ++ unsigned dev_idx, nr_devices, u64s; ++ struct printbuf errbuf = PRINTBUF; ++ int ret; ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; ++ ++ err = bch2_dev_may_add(sb.sb, c); ++ if (err) { ++ bch_err(c, "device add error: %s", err); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ca = __bch2_dev_alloc(c, &dev_mi); ++ if (!ca) { ++ bch2_free_super(&sb); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bch2_dev_usage_init(ca); ++ ++ ret = __bch2_dev_attach_bdev(ca, &sb); ++ if (ret) { ++ bch2_dev_free(ca); ++ goto err; ++ } ++ ++ ret = 
bch2_dev_journal_alloc(ca); ++ if (ret) { ++ bch_err(c, "device add error: journal alloc failed"); ++ goto err; ++ } ++ ++ down_write(&c->state_lock); ++ mutex_lock(&c->sb_lock); ++ ++ ret = bch2_sb_from_fs(c, ca); ++ if (ret) { ++ bch_err(c, "device add error: new device superblock too small"); ++ goto err_unlock; ++ } ++ ++ mi = bch2_sb_get_members(ca->disk_sb.sb); ++ ++ if (!bch2_sb_resize_members(&ca->disk_sb, ++ le32_to_cpu(mi->field.u64s) + ++ sizeof(dev_mi) / sizeof(u64))) { ++ bch_err(c, "device add error: new device superblock too small"); ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto no_slot; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) ++ goto have_slot; ++no_slot: ++ bch_err(c, "device add error: already have maximum number of devices"); ++ ret = -ENOSPC; ++ goto err_unlock; ++ ++have_slot: ++ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ u64s = (sizeof(struct bch_sb_field_members) + ++ sizeof(struct bch_member) * nr_devices) / sizeof(u64); ++ ++ mi = bch2_sb_resize_members(&c->disk_sb, u64s); ++ if (!mi) { ++ bch_err(c, "device add error: no room in superblock for member info"); ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ /* success: */ ++ ++ mi->members[dev_idx] = dev_mi; ++ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ c->disk_sb.sb->nr_devices = nr_devices; ++ ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_dev_usage_journal_reserve(c); ++ ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); ++ goto err_late; ++ } ++ ++ ret = bch2_fs_freespace_init(c); ++ if (ret) { ++ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); ++ goto err_late; ++ } ++ ++ ca->new_fs_bucket_idx = 0; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++ ++err_unlock: ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ bch2_free_super(&sb); ++ printbuf_exit(&errbuf); ++ return ret; ++err_late: ++ up_write(&c->state_lock); ++ ca = NULL; ++ goto err; ++} ++ ++/* Hot add existing device to running filesystem: */ ++int bch2_dev_online(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb = { NULL }; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ const char *err; ++ int ret; ++ ++ down_write(&c->state_lock); ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ up_write(&c->state_lock); ++ return ret; ++ } ++ ++ dev_idx = sb.sb->dev_idx; ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); ++ if (err) { ++ bch_err(c, "error bringing %s online: %s", path, err); ++ goto err; ++ } ++ ++ ret = bch2_dev_attach_bdev(c, &sb); ++ if (ret) ++ goto err; ++ ++ ca = bch_dev_locked(c, dev_idx); ++ ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", ++ path, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ 
mi->members[ca->dev_idx].last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ up_write(&c->state_lock); ++ return 0; ++err: ++ up_write(&c->state_lock); ++ bch2_free_super(&sb); ++ return -EINVAL; ++} ++ ++int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ down_write(&c->state_lock); ++ ++ if (!bch2_dev_is_online(ca)) { ++ bch_err(ca, "Already offline"); ++ up_write(&c->state_lock); ++ return 0; ++ } ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { ++ bch_err(ca, "Cannot offline required disk"); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++} ++ ++int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bch_member *mi; ++ int ret = 0; ++ ++ down_write(&c->state_lock); ++ ++ if (nbuckets < ca->mi.nbuckets) { ++ bch_err(ca, "Cannot shrink yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (bch2_dev_is_online(ca) && ++ get_capacity(ca->disk_sb.bdev->bd_disk) < ++ ca->mi.bucket_size * nbuckets) { ++ bch_err(ca, "New size larger than device"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_dev_buckets_resize(c, ca, nbuckets); ++ if (ret) { ++ bch_err(ca, "Resize error: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ mi->nbuckets = cpu_to_le64(nbuckets); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_recalc_capacity(c); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* return with ref on ca->ref: */ ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (!strcmp(name, ca->name)) ++ goto found; ++ ca = ERR_PTR(-ENOENT); ++found: ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* Filesystem open: */ ++ ++struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, ++ struct bch_opts opts) ++{ ++ struct bch_sb_handle *sb = NULL; ++ struct bch_fs *c = NULL; ++ struct bch_sb_field_members *mi; ++ unsigned i, best_sb = 0; ++ const char *err; ++ struct printbuf errbuf = PRINTBUF; ++ int ret = 0; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return ERR_PTR(-ENODEV); ++ ++ pr_verbose_init(opts, ""); ++ ++ if (!nr_devices) { ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); ++ if (!sb) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ if (ret) ++ goto err; ++ ++ } ++ ++ for (i = 1; i < nr_devices; i++) ++ if (le64_to_cpu(sb[i].sb->seq) > ++ le64_to_cpu(sb[best_sb].sb->seq)) ++ best_sb = i; ++ ++ mi = bch2_sb_get_members(sb[best_sb].sb); ++ ++ i = 0; ++ while (i < nr_devices) { ++ if (i != best_sb && ++ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { ++ char buf[BDEVNAME_SIZE]; ++ pr_info("%s has been removed, skipping", ++ bdevname(sb[i].bdev, buf)); ++ bch2_free_super(&sb[i]); ++ array_remove_item(sb, nr_devices, i); ++ continue; ++ } ++ ++ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ if (err) ++ goto err_print; ++ i++; ++ } ++ ++ c = bch2_fs_alloc(sb[best_sb].sb, opts); ++ if (IS_ERR(c)) { ++ ret = PTR_ERR(c); ++ goto err; ++ } ++ ++ down_write(&c->state_lock); ++ for (i = 0; i < 
nr_devices; i++) { ++ ret = bch2_dev_attach_bdev(c, &sb[i]); ++ if (ret) { ++ up_write(&c->state_lock); ++ goto err; ++ } ++ } ++ up_write(&c->state_lock); ++ ++ err = "insufficient devices"; ++ if (!bch2_fs_may_start(c)) ++ goto err_print; ++ ++ if (!c->opts.nostart) { ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++out: ++ kfree(sb); ++ printbuf_exit(&errbuf); ++ module_put(THIS_MODULE); ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err_print: ++ pr_err("bch_fs_open err opening %s: %s", ++ devices[0], err); ++ ret = -EINVAL; ++err: ++ if (!IS_ERR_OR_NULL(c)) ++ bch2_fs_stop(c); ++ if (sb) ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++/* Global interfaces/init */ ++ ++static void bcachefs_exit(void) ++{ ++ bch2_debug_exit(); ++ bch2_vfs_exit(); ++ bch2_chardev_exit(); ++ bch2_btree_key_cache_exit(); ++ if (bcachefs_kset) ++ kset_unregister(bcachefs_kset); ++} ++ ++static int __init bcachefs_init(void) ++{ ++ bch2_bkey_pack_test(); ++ ++ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_btree_key_cache_init() || ++ bch2_chardev_init() || ++ bch2_vfs_init() || ++ bch2_debug_init()) ++ goto err; ++ ++ return 0; ++err: ++ bcachefs_exit(); ++ return -ENOMEM; ++} ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ bool bch2_##name; \ ++ module_param_named(name, bch2_##name, bool, 0644); \ ++ MODULE_PARM_DESC(name, description); ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++module_exit(bcachefs_exit); ++module_init(bcachefs_init); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +new file mode 100644 +index 000000000000..8501adaff4c2 +--- /dev/null ++++ b/fs/bcachefs/super.h +@@ -0,0 +1,264 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_H ++#define _BCACHEFS_SUPER_H ++ ++#include "extents.h" ++ ++#include "bcachefs_ioctl.h" ++ ++#include <linux/math64.h> ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_failed; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw || ++ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ 
unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); ++ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); ++ devs->devs[devs->nr++] = dev; ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = 
bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ ++struct bch_fs *bch2_dev_to_fs(dev_t); ++struct bch_fs *bch2_uuid_to_fs(uuid_le); ++ ++bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++ ++int bch2_dev_fail(struct bch_dev *, int); ++int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_add(struct bch_fs *, const char *); ++int bch2_dev_online(struct bch_fs *, const char *); ++int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); ++struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *); ++void bch2_fs_read_only(struct bch_fs *); ++ ++int bch2_fs_read_write(struct bch_fs *); ++int bch2_fs_read_write_early(struct bch_fs *); ++ ++/* ++ * Only for use in the recovery/fsck path: ++ */ ++static inline void bch2_fs_lazy_rw(struct bch_fs *c) ++{ ++ if (percpu_ref_is_zero(&c->writes)) ++ bch2_fs_read_write_early(c); ++} ++ ++void __bch2_fs_stop(struct bch_fs *); ++void bch2_fs_free(struct bch_fs *); ++void bch2_fs_stop(struct bch_fs *); ++ ++int bch2_fs_start(struct bch_fs *); ++struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++ ++#endif /* _BCACHEFS_SUPER_H */ +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +new file mode 100644 +index 000000000000..89419fc7930d +--- /dev/null ++++ b/fs/bcachefs/super_types.h +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_TYPES_H ++#define _BCACHEFS_SUPER_TYPES_H ++ ++struct bch_sb_handle { ++ struct bch_sb *sb; ++ struct block_device *bdev; ++ struct bio *bio; ++ size_t buffer_size; ++ fmode_t mode; ++ unsigned have_layout:1; ++ unsigned have_bio:1; ++ unsigned fs_sb:1; ++ u64 seq; ++}; ++ ++struct bch_devs_mask { ++ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; ++}; ++ ++struct bch_devs_list { ++ u8 nr; ++ u8 devs[BCH_BKEY_PTRS_MAX]; ++}; ++ ++struct bch_member_cpu { ++ u64 nbuckets; /* device size */ ++ u16 first_bucket; /* index of first bucket used */ ++ u16 bucket_size; /* sectors */ ++ u16 group; ++ u8 state; ++ u8 discard; ++ u8 data_allowed; ++ u8 durability; ++ u8 freespace_initialized; ++ u8 valid; ++}; ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[]; ++}; ++ ++#endif /* _BCACHEFS_SUPER_TYPES_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +new file mode 100644 +index 000000000000..2c650055f530 +--- /dev/null ++++ b/fs/bcachefs/sysfs.c +@@ -0,0 +1,943 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcache sysfs interfaces ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "sysfs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "inode.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "opts.h" ++#include "rebalance.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "tests.h" ++ ++#include ++#include ++#include ++#include ++ ++#include "util.h" ++ ++#define SYSFS_OPS(type) \ ++const struct sysfs_ops type ## _sysfs_ops = { \ ++ .show = type ## _show, \ ++ .store = type ## _store \ ++} ++ ++#define SHOW(fn) \ ++static ssize_t fn ## _to_text(struct printbuf *, \ ++ struct kobject *, struct attribute *);\ ++ \ ++static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ++ char *buf) \ ++{ \ ++ struct printbuf out = PRINTBUF; \ ++ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ ++ \ ++ if (out.pos && out.buf[out.pos - 1] != '\n') \ ++ prt_newline(&out); \ ++ \ ++ if (!ret && out.allocation_failure) \ ++ ret = -ENOMEM; \ ++ \ ++ if (!ret) { \ ++ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ ++ memcpy(buf, out.buf, ret); \ ++ } \ ++ printbuf_exit(&out); \ ++ return ret; \ ++} \ ++ \ ++static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ ++ struct attribute *attr) ++ ++#define STORE(fn) \ ++static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ ++ const char *buf, size_t size) \ ++ ++#define __sysfs_attribute(_name, _mode) \ ++ static struct attribute sysfs_##_name = \ ++ { .name = #_name, .mode = _mode } ++ ++#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) ++#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) ++#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++ ++#define sysfs_printf(file, fmt, ...) 
\ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ prt_printf(out, fmt "\n", __VA_ARGS__); \ ++} while (0) ++ ++#define sysfs_print(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ snprint(out, var); \ ++} while (0) ++ ++#define sysfs_hprint(file, val) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ prt_human_readable_s64(out, val); \ ++} while (0) ++ ++#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) ++#define var_print(_var) sysfs_print(_var, var(_var)) ++#define var_hprint(_var) sysfs_hprint(_var, var(_var)) ++ ++#define sysfs_strtoul(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe(buf, var) ?: (ssize_t) size; \ ++} while (0) ++ ++#define sysfs_strtoul_clamp(file, var, min, max) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe_clamp(buf, var, min, max) \ ++ ?: (ssize_t) size; \ ++} while (0) ++ ++#define strtoul_or_return(cp) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define strtoul_restrict_or_return(cp, min, max) \ ++({ \ ++ unsigned long __v = 0; \ ++ int _r = strtoul_safe_restrict(cp, __v, min, max); \ ++ if (_r) \ ++ return _r; \ ++ __v; \ ++}) ++ ++#define strtoi_h_or_return(cp) \ ++({ \ ++ u64 _v; \ ++ int _r = strtoi_h(cp, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define sysfs_hatoi(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoi_h(buf, &var) ?: (ssize_t) size; \ ++} while (0) ++ ++write_attribute(trigger_gc); ++write_attribute(trigger_discards); ++write_attribute(trigger_invalidates); ++write_attribute(prune_cache); ++rw_attribute(btree_gc_periodic); ++rw_attribute(gc_gens_pos); ++ ++read_attribute(uuid); ++read_attribute(minor); ++read_attribute(bucket_size); ++read_attribute(first_bucket); ++read_attribute(nbuckets); ++read_attribute(durability); ++read_attribute(iodone); ++ ++read_attribute(io_latency_read); ++read_attribute(io_latency_write); ++read_attribute(io_latency_stats_read); ++read_attribute(io_latency_stats_write); ++read_attribute(congested); ++ ++read_attribute(btree_avg_write_size); ++ ++read_attribute(btree_cache_size); ++read_attribute(compression_stats); ++read_attribute(journal_debug); ++read_attribute(btree_updates); ++read_attribute(btree_cache); ++read_attribute(btree_key_cache); ++read_attribute(stripes_heap); ++read_attribute(open_buckets); ++ ++read_attribute(internal_uuid); ++ ++read_attribute(has_data); ++read_attribute(alloc_debug); ++ ++read_attribute(read_realloc_races); ++read_attribute(extent_migrate_done); ++read_attribute(extent_migrate_raced); ++read_attribute(bucket_alloc_fail); ++ ++#define x(t, n, ...) 
read_attribute(t); ++BCH_PERSISTENT_COUNTERS() ++#undef x ++ ++rw_attribute(discard); ++rw_attribute(label); ++ ++rw_attribute(copy_gc_enabled); ++read_attribute(copy_gc_wait); ++ ++rw_attribute(rebalance_enabled); ++sysfs_pd_controller_attribute(rebalance); ++read_attribute(rebalance_work); ++rw_attribute(promote_whole_extents); ++ ++read_attribute(new_stripes); ++ ++read_attribute(io_timers_read); ++read_attribute(io_timers_write); ++ ++read_attribute(data_jobs); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++write_attribute(perf_test); ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#define x(_name) \ ++ static struct attribute sysfs_time_stat_##_name = \ ++ { .name = #_name, .mode = S_IRUGO }; ++ BCH_TIME_STATS() ++#undef x ++ ++static struct attribute sysfs_state_rw = { ++ .name = "state", ++ .mode = S_IRUGO ++}; ++ ++static size_t bch2_btree_cache_size(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct btree *b; ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_for_each_entry(b, &c->btree_cache.live, list) ++ ret += btree_bytes(c); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ return ret; ++} ++ ++static size_t bch2_btree_avg_write_size(struct bch_fs *c) ++{ ++ u64 nr = atomic64_read(&c->btree_writes_nr); ++ u64 sectors = atomic64_read(&c->btree_writes_sectors); ++ ++ return nr ? div64_u64(sectors, nr) : 0; ++} ++ ++static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ long ret = 0; ++ struct bch_move_stats *stats; ++ ++ mutex_lock(&c->data_progress_lock); ++ list_for_each_entry(stats, &c->data_progress_list, list) { ++ prt_printf(out, "%s: data type %s btree_id %s position: ", ++ stats->name, ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); ++ prt_printf(out, "%s", "\n"); ++ } ++ ++ mutex_unlock(&c->data_progress_lock); ++ return ret; ++} ++ ++static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id id; ++ u64 nr_uncompressed_extents = 0, ++ nr_compressed_extents = 0, ++ nr_incompressible_extents = 0, ++ uncompressed_sectors = 0, ++ incompressible_sectors = 0, ++ compressed_sectors_compressed = 0, ++ compressed_sectors_uncompressed = 0; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ continue; ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bool compressed = false, uncompressed = false, incompressible = false; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ switch (p.crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_none: ++ uncompressed = true; ++ uncompressed_sectors += k.k->size; ++ break; ++ case BCH_COMPRESSION_TYPE_incompressible: ++ incompressible = true; ++ incompressible_sectors += k.k->size; ++ break; ++ default: ++ compressed_sectors_compressed += ++ p.crc.compressed_size; ++ compressed_sectors_uncompressed += ++ p.crc.uncompressed_size; ++ compressed = true; ++ break; ++ } ++ } ++ ++ if (incompressible) ++ nr_incompressible_extents++; ++ else if (uncompressed) ++ nr_uncompressed_extents++; ++ else if (compressed) ++ nr_compressed_extents++; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return 
ret; ++ ++ prt_printf(out, "uncompressed:\n"); ++ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); ++ prt_printf(out, " size: "); ++ prt_human_readable_u64(out, uncompressed_sectors << 9); ++ prt_printf(out, "\n"); ++ ++ prt_printf(out, "compressed:\n"); ++ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); ++ prt_printf(out, " compressed size: "); ++ prt_human_readable_u64(out, compressed_sectors_compressed << 9); ++ prt_printf(out, "\n"); ++ prt_printf(out, " uncompressed size: "); ++ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); ++ prt_printf(out, "\n"); ++ ++ prt_printf(out, "incompressible:\n"); ++ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); ++ prt_printf(out, " size: "); ++ prt_human_readable_u64(out, incompressible_sectors << 9); ++ prt_printf(out, "\n"); ++ return 0; ++} ++ ++static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); ++ bch2_bpos_to_text(out, c->gc_gens_pos); ++ prt_printf(out, "\n"); ++} ++ ++SHOW(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_print(minor, c->minor); ++ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); ++ ++ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); ++ ++ sysfs_print(read_realloc_races, ++ atomic_long_read(&c->read_realloc_races)); ++ sysfs_print(extent_migrate_done, ++ atomic_long_read(&c->extent_migrate_done)); ++ sysfs_print(extent_migrate_raced, ++ atomic_long_read(&c->extent_migrate_raced)); ++ sysfs_print(bucket_alloc_fail, ++ atomic_long_read(&c->bucket_alloc_fail)); ++ ++ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); ++ ++ if (attr == &sysfs_gc_gens_pos) ++ bch2_gc_gens_pos_to_text(out, c); ++ ++ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ ++ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); ++ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ sysfs_hprint(copy_gc_wait, ++ max(0LL, c->copygc_wait - ++ atomic64_read(&c->io_clock[WRITE].now)) << 9); ++ ++ if (attr == &sysfs_rebalance_work) ++ bch2_rebalance_work_to_text(out, c); ++ ++ sysfs_print(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_journal_debug) ++ bch2_journal_debug_to_text(out, &c->journal); ++ ++ if (attr == &sysfs_btree_updates) ++ bch2_btree_updates_to_text(out, c); ++ ++ if (attr == &sysfs_btree_cache) ++ bch2_btree_cache_to_text(out, c); ++ ++ if (attr == &sysfs_btree_key_cache) ++ bch2_btree_key_cache_to_text(out, &c->btree_key_cache); ++ ++ if (attr == &sysfs_stripes_heap) ++ bch2_stripes_heap_to_text(out, c); ++ ++ if (attr == &sysfs_open_buckets) ++ bch2_open_buckets_to_text(out, c); ++ ++ if (attr == &sysfs_compression_stats) ++ bch2_compression_stats_to_text(out, c); ++ ++ if (attr == &sysfs_new_stripes) ++ bch2_new_stripes_to_text(out, c); ++ ++ if (attr == &sysfs_io_timers_read) ++ bch2_io_timers_to_text(out, &c->io_clock[READ]); ++ ++ if (attr == &sysfs_io_timers_write) ++ bch2_io_timers_to_text(out, &c->io_clock[WRITE]); ++ ++ if (attr == &sysfs_data_jobs) ++ data_progress_to_text(out, c); ++ ++ return 0; ++} ++ ++STORE(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ if (attr == &sysfs_btree_gc_periodic) { ++ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ++ ?: (ssize_t) size; ++ ++ wake_up_process(c->gc_thread); ++ return ret; ++ 
} ++ ++ if (attr == &sysfs_copy_gc_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ++ ?: (ssize_t) size; ++ ++ if (c->copygc_thread) ++ wake_up_process(c->copygc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_rebalance_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ++ ?: (ssize_t) size; ++ ++ rebalance_wakeup(c); ++ return ret; ++ } ++ ++ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ ++ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ /* Debugging: */ ++ ++ if (!test_bit(BCH_FS_RW, &c->flags)) ++ return -EROFS; ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++ ++ if (attr == &sysfs_trigger_gc) { ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ down_read(&c->state_lock); ++ bch2_gc(c, false, false); ++ up_read(&c->state_lock); ++#else ++ bch2_gc_gens(c); ++#endif ++ } ++ ++ if (attr == &sysfs_trigger_discards) ++ bch2_do_discards(c); ++ ++ if (attr == &sysfs_trigger_invalidates) ++ bch2_do_invalidates(c); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ if (attr == &sysfs_perf_test) { ++ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; ++ char *test = strsep(&p, " \t\n"); ++ char *nr_str = strsep(&p, " \t\n"); ++ char *threads_str = strsep(&p, " \t\n"); ++ unsigned threads; ++ u64 nr; ++ int ret = -EINVAL; ++ ++ if (threads_str && ++ !(ret = kstrtouint(threads_str, 10, &threads)) && ++ !(ret = bch2_strtoull_h(nr_str, &nr))) ++ ret = bch2_btree_perf_test(c, test, nr, threads); ++ kfree(tmp); ++ ++ if (ret) ++ size = ret; ++ } ++#endif ++ return size; ++} ++SYSFS_OPS(bch2_fs); ++ ++struct attribute *bch2_fs_files[] = { ++ &sysfs_minor, ++ &sysfs_btree_cache_size, ++ &sysfs_btree_avg_write_size, ++ ++ &sysfs_promote_whole_extents, ++ ++ &sysfs_compression_stats, ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ &sysfs_perf_test, ++#endif ++ NULL ++}; ++ ++/* counters dir */ ++ ++SHOW(bch2_fs_counters) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); ++ u64 counter = 0; ++ u64 counter_since_mount = 0; ++ ++ out->tabstops[0] = 32; ++ #define x(t, ...) \ ++ if (attr == &sysfs_##t) { \ ++ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ ++ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ ++ prt_printf(out, "since mount:"); \ ++ prt_tab(out); \ ++ prt_human_readable_u64(out, counter_since_mount << 9); \ ++ prt_newline(out); \ ++ \ ++ prt_printf(out, "since filesystem creation:"); \ ++ prt_tab(out); \ ++ prt_human_readable_u64(out, counter << 9); \ ++ prt_newline(out); \ ++ } ++ BCH_PERSISTENT_COUNTERS() ++ #undef x ++ return 0; ++} ++ ++STORE(bch2_fs_counters) { ++ return 0; ++} ++ ++SYSFS_OPS(bch2_fs_counters); ++ ++struct attribute *bch2_fs_counters_files[] = { ++#define x(t, ...) 
\ ++ &sysfs_##t, ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ NULL ++}; ++/* internal dir - just a wrapper */ ++ ++SHOW(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_to_text(out, &c->kobj, attr); ++} ++ ++STORE(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_store(&c->kobj, attr, buf, size); ++} ++SYSFS_OPS(bch2_fs_internal); ++ ++struct attribute *bch2_fs_internal_files[] = { ++ &sysfs_journal_debug, ++ &sysfs_btree_updates, ++ &sysfs_btree_cache, ++ &sysfs_btree_key_cache, ++ &sysfs_new_stripes, ++ &sysfs_stripes_heap, ++ &sysfs_open_buckets, ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_trigger_gc, ++ &sysfs_trigger_discards, ++ &sysfs_trigger_invalidates, ++ &sysfs_prune_cache, ++ ++ &sysfs_read_realloc_races, ++ &sysfs_extent_migrate_done, ++ &sysfs_extent_migrate_raced, ++ &sysfs_bucket_alloc_fail, ++ ++ &sysfs_gc_gens_pos, ++ ++ &sysfs_copy_gc_enabled, ++ &sysfs_copy_gc_wait, ++ ++ &sysfs_rebalance_enabled, ++ &sysfs_rebalance_work, ++ sysfs_pd_controller_files(rebalance), ++ ++ &sysfs_data_jobs, ++ ++ &sysfs_internal_uuid, ++ NULL ++}; ++ ++/* options */ ++ ++SHOW(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int id = opt - bch2_opt_table; ++ u64 v = bch2_opt_get_by_id(&c->opts, id); ++ ++ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); ++ prt_char(out, '\n'); ++ ++ return 0; ++} ++ ++STORE(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int ret, id = opt - bch2_opt_table; ++ char *tmp; ++ u64 v; ++ ++ /* ++ * We don't need to take c->writes for correctness, but it eliminates an ++ * unsightly error message in the dmesg log when we're RO: ++ */ ++ if (unlikely(!percpu_ref_tryget_live(&c->writes))) ++ return -EROFS; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); ++ kfree(tmp); ++ ++ if (ret < 0) ++ goto err; ++ ++ ret = bch2_opt_check_may_set(c, id, v); ++ if (ret < 0) ++ goto err; ++ ++ bch2_opt_set_sb(c, opt, v); ++ bch2_opt_set_by_id(&c->opts, id, v); ++ ++ if ((id == Opt_background_target || ++ id == Opt_background_compression) && v) { ++ bch2_rebalance_add_work(c, S64_MAX); ++ rebalance_wakeup(c); ++ } ++ ++ ret = size; ++err: ++ percpu_ref_put(&c->writes); ++ return ret; ++} ++SYSFS_OPS(bch2_fs_opts_dir); ++ ++struct attribute *bch2_fs_opts_dir_files[] = { NULL }; ++ ++int bch2_opts_create_sysfs_files(struct kobject *kobj) ++{ ++ const struct bch_option *i; ++ int ret; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + bch2_opts_nr; ++ i++) { ++ if (!(i->flags & OPT_FS)) ++ continue; ++ ++ ret = sysfs_create_file(kobj, &i->attr); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* time stats */ ++ ++SHOW(bch2_fs_time_stats) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) \ ++ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); ++ BCH_TIME_STATS() ++#undef x ++ ++ return 0; ++} ++ ++STORE(bch2_fs_time_stats) ++{ ++ return size; ++} ++SYSFS_OPS(bch2_fs_time_stats); ++ ++struct attribute *bch2_fs_time_stats_files[] = { ++#define x(name) \ ++ &sysfs_time_stat_##name, ++ 
BCH_TIME_STATS() ++#undef x ++ NULL ++}; ++ ++static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); ++ unsigned i, nr[BCH_DATA_NR]; ++ ++ memset(nr, 0, sizeof(nr)); ++ ++ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) ++ nr[c->open_buckets[i].data_type]++; ++ ++ prt_printf(out, ++ "\t\t\t buckets\t sectors fragmented\n" ++ "capacity\t%16llu\n", ++ ca->mi.nbuckets - ca->mi.first_bucket); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) ++ prt_printf(out, "%-16s%16llu%16llu%16llu\n", ++ bch2_data_types[i], stats.d[i].buckets, ++ stats.d[i].sectors, stats.d[i].fragmented); ++ ++ prt_printf(out, ++ "ec\t\t%16llu\n" ++ "\n" ++ "freelist_wait\t\t%s\n" ++ "open buckets allocated\t%u\n" ++ "open buckets this dev\t%u\n" ++ "open buckets total\t%u\n" ++ "open_buckets_wait\t%s\n" ++ "open_buckets_btree\t%u\n" ++ "open_buckets_user\t%u\n" ++ "buckets_to_invalidate\t%llu\n" ++ "btree reserve cache\t%u\n", ++ stats.buckets_ec, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ++ ca->nr_open_buckets, ++ OPEN_BUCKETS_COUNT, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_btree], ++ nr[BCH_DATA_user], ++ should_invalidate_buckets(ca, stats), ++ c->btree_reserve_cache_nr); ++} ++ ++static const char * const bch2_rw[] = { ++ "read", ++ "write", ++ NULL ++}; ++ ++static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ int rw, i; ++ ++ for (rw = 0; rw < 2; rw++) { ++ prt_printf(out, "%s:\n", bch2_rw[rw]); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ prt_printf(out, "%-12s:%12llu\n", ++ bch2_data_types[i], ++ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); ++ } ++} ++ ++SHOW(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ ++ sysfs_printf(uuid, "%pU\n", ca->uuid.b); ++ ++ sysfs_print(bucket_size, bucket_bytes(ca)); ++ sysfs_print(first_bucket, ca->mi.first_bucket); ++ sysfs_print(nbuckets, ca->mi.nbuckets); ++ sysfs_print(durability, ca->mi.durability); ++ sysfs_print(discard, ca->mi.discard); ++ ++ if (attr == &sysfs_label) { ++ if (ca->mi.group) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, ++ ca->mi.group - 1); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ prt_char(out, '\n'); ++ } ++ ++ if (attr == &sysfs_has_data) { ++ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); ++ prt_char(out, '\n'); ++ } ++ ++ if (attr == &sysfs_state_rw) { ++ prt_string_option(out, bch2_member_states, ca->mi.state); ++ prt_char(out, '\n'); ++ } ++ ++ if (attr == &sysfs_iodone) ++ dev_iodone_to_text(out, ca); ++ ++ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); ++ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); ++ ++ if (attr == &sysfs_io_latency_stats_read) ++ bch2_time_stats_to_text(out, &ca->io_latency[READ]); ++ ++ if (attr == &sysfs_io_latency_stats_write) ++ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); ++ ++ sysfs_printf(congested, "%u%%", ++ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) ++ * 100 / CONGESTED_MAX); ++ ++ if (attr == &sysfs_alloc_debug) ++ dev_alloc_debug_to_text(out, ca); ++ ++ return 0; ++} ++ ++STORE(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct bch_member *mi; ++ ++ if (attr == &sysfs_discard) { ++ bool v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ 
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DISCARD(mi)) { ++ SET_BCH_MEMBER_DISCARD(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_label) { ++ char *tmp; ++ int ret; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_dev_group_set(c, ca, strim(tmp)); ++ kfree(tmp); ++ if (ret) ++ return ret; ++ } ++ ++ return size; ++} ++SYSFS_OPS(bch2_dev); ++ ++struct attribute *bch2_dev_files[] = { ++ &sysfs_uuid, ++ &sysfs_bucket_size, ++ &sysfs_first_bucket, ++ &sysfs_nbuckets, ++ &sysfs_durability, ++ ++ /* settings: */ ++ &sysfs_discard, ++ &sysfs_state_rw, ++ &sysfs_label, ++ ++ &sysfs_has_data, ++ &sysfs_iodone, ++ ++ &sysfs_io_latency_read, ++ &sysfs_io_latency_write, ++ &sysfs_io_latency_stats_read, ++ &sysfs_io_latency_stats_write, ++ &sysfs_congested, ++ ++ /* debug: */ ++ &sysfs_alloc_debug, ++ NULL ++}; ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h +new file mode 100644 +index 000000000000..222cd5062702 +--- /dev/null ++++ b/fs/bcachefs/sysfs.h +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SYSFS_H_ ++#define _BCACHEFS_SYSFS_H_ ++ ++#include ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++struct attribute; ++struct sysfs_ops; ++ ++extern struct attribute *bch2_fs_files[]; ++extern struct attribute *bch2_fs_counters_files[]; ++extern struct attribute *bch2_fs_internal_files[]; ++extern struct attribute *bch2_fs_opts_dir_files[]; ++extern struct attribute *bch2_fs_time_stats_files[]; ++extern struct attribute *bch2_dev_files[]; ++ ++extern const struct sysfs_ops bch2_fs_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++extern const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++int bch2_opts_create_sysfs_files(struct kobject *); ++ ++#else ++ ++static struct attribute *bch2_fs_files[] = {}; ++static struct attribute *bch2_fs_counters_files[] = {}; ++static struct attribute *bch2_fs_internal_files[] = {}; ++static struct attribute *bch2_fs_opts_dir_files[] = {}; ++static struct attribute *bch2_fs_time_stats_files[] = {}; ++static struct attribute *bch2_dev_files[] = {}; ++ ++static const struct sysfs_ops bch2_fs_sysfs_ops; ++static const struct sysfs_ops bch2_fs_counters_sysfs_ops; ++static const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++static const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } ++ ++#endif /* NO_BCACHEFS_SYSFS */ ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +new file mode 100644 +index 000000000000..56058a56f2a2 +--- /dev/null ++++ b/fs/bcachefs/tests.c +@@ -0,0 +1,976 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "journal_reclaim.h" ++#include "subvolume.h" ++#include "tests.h" ++ ++#include "linux/kthread.h" ++#include "linux/random.h" ++ ++static void delete_test_keys(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, ++ NULL); ++ BUG_ON(ret); ++ ++ ret = 
bch2_btree_delete_range(c, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); ++ BUG_ON(ret); ++} ++ ++/* unit tests */ ++ ++static int test_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.snapshot = U32_MAX; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &k.k_i, 0)); ++ if (ret) { ++ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ pr_info("deleting once"); ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); ++ if (ret) { ++ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ pr_info("deleting twice"); ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); ++ if (ret) { ++ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_delete_written(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.snapshot = U32_MAX; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &k.k_i, 0)); ++ if (ret) { ++ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ bch2_trans_unlock(&trans); ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); ++ if (ret) { ++ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i; ++ k.k.p.snapshot = U32_MAX; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(k.k->p.offset != i++); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != --i); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, 
bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test extents"); ++ ++ for (i = 0; i < nr; i += 8) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 8; ++ k.k.p.snapshot = U32_MAX; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ i = k.k->p.offset; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate_slots(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i * 2; ++ k.k.p.snapshot = U32_MAX; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(k.k->p.offset != i); ++ i += 2; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr * 2); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i >= nr * 2) ++ break; ++ ++ BUG_ON(k.k->p.offset != i); ++ BUG_ON(bkey_deleted(k.k) != (i & 1)); ++ ++ i++; ++ 0; ++ })); ++ if (ret < 0) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ret = 0; ++err: ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i += 16) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 16; ++ k.k.p.snapshot = U32_MAX; ++ k.k.size = 8; ++ ++ ret = 
bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(bkey_start_offset(k.k) != i + 8); ++ BUG_ON(k.k->size != 8); ++ i += 16; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i == nr) ++ break; ++ BUG_ON(bkey_deleted(k.k) != !(i % 16)); ++ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ BUG_ON(k.k->size != 8); ++ i = k.k->p.offset; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ret = 0; ++err: ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++/* ++ * XXX: we really want to make sure we've got a btree with depth > 0 for these ++ * tests ++ */ ++static int test_peek_end(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++static int test_peek_end_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++/* extent unit tests */ ++ ++u64 test_version; ++ ++static int insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.offset = end; ++ k.k_i.k.p.snapshot = U32_MAX; ++ k.k_i.k.size = end - start; ++ k.k_i.k.version.lo = test_version++; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int __test_extent_overwrite(struct bch_fs *c, ++ u64 e1_start, u64 e1_end, ++ u64 e2_start, u64 e2_end) ++{ ++ int ret; ++ ++ ret = insert_test_extent(c, e1_start, e1_end) ?: ++ insert_test_extent(c, e2_start, e2_end); ++ ++ delete_test_keys(c); ++ return ret; ++} ++ ++static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 0, 64, 0, 32) ?: ++ __test_extent_overwrite(c, 8, 64, 0, 32); ++} ++ ++static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 0, 64, 32, 64) ?: ++ __test_extent_overwrite(c, 0, 64, 32, 72); ++} ++ ++static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 0, 64, 32, 40); ++} 
++ ++static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 32, 64, 0, 64) ?: ++ __test_extent_overwrite(c, 32, 64, 0, 128) ?: ++ __test_extent_overwrite(c, 32, 64, 32, 64) ?: ++ __test_extent_overwrite(c, 32, 64, 32, 128); ++} ++ ++/* snapshot unit tests */ ++ ++/* Test skipping over keys in unrelated snapshots: */ ++static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie cookie; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = snapid_hi; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, snapid_lo), 0); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ ++ BUG_ON(k.k->p.snapshot != U32_MAX); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_snapshots(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i_cookie cookie; ++ u32 snapids[2]; ++ u32 snapid_subvols[2] = { 1, 1 }; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = U32_MAX; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_snapshot_node_create(&trans, U32_MAX, ++ snapids, ++ snapid_subvols, ++ 2)); ++ if (ret) ++ return ret; ++ ++ if (snapids[0] > snapids[1]) ++ swap(snapids[0], snapids[1]); ++ ++ ret = test_snapshot_filter(c, snapids[0], snapids[1]); ++ if (ret) { ++ bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* perf tests */ ++ ++static u64 test_rand(void) ++{ ++ u64 v; ++#if 0 ++ v = prandom_u32(); ++#else ++ prandom_bytes(&v, sizeof(v)); ++#endif ++ return v; ++} ++ ++static int rand_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k; ++ int ret = 0; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = test_rand(); ++ k.k.p.snapshot = U32_MAX; ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); ++ if (ret) { ++ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int rand_insert_multi(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k[8]; ++ int ret = 0; ++ unsigned j; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i += ARRAY_SIZE(k)) { ++ for (j = 0; j < ARRAY_SIZE(k); j++) { ++ bkey_cookie_init(&k[j].k_i); ++ k[j].k.p.offset = test_rand(); ++ k[j].k.p.snapshot = U32_MAX; ++ } ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); ++ if (ret) { ++ bch_err(c, "error in rand_insert_multi: %s", 
bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int rand_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ for (i = 0; i < nr; i++) { ++ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ ret = bkey_err(k); ++ if (ret) { ++ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int rand_mixed_trans(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i_cookie *cookie, ++ u64 i, u64 pos) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); ++ ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(iter))); ++ ret = bkey_err(k); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ ++ if (!(i & 3) && k.k) { ++ bkey_cookie_init(&cookie->k_i); ++ cookie->k.p = iter->pos; ++ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); ++ } ++ ++ return ret; ++} ++ ++static int rand_mixed(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_i_cookie cookie; ++ int ret = 0; ++ u64 i, rand; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ for (i = 0; i < nr; i++) { ++ rand = test_rand(); ++ ret = commit_do(&trans, NULL, NULL, 0, ++ rand_mixed_trans(&trans, &iter, &cookie, i, rand)); ++ if (ret) { ++ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int __do_delete(struct btree_trans *trans, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, ++ BTREE_ITER_INTENT); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k) ++ goto err; ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int rand_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ int ret = 0; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ struct bpos pos = SPOS(0, test_rand(), U32_MAX); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ __do_delete(&trans, pos)); ++ if (ret) { ++ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie insert; ++ int ret = 0; ++ ++ bkey_cookie_init(&insert.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ if (iter.pos.offset >= nr) ++ break; ++ insert.k.p = iter.pos; ++ bch2_trans_update(&trans, &iter, 
&insert.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ++ 0); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_overwrite(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ struct bkey_i_cookie u; ++ ++ bkey_reassemble(&u.k_i, k); ++ bch2_trans_update(&trans, &iter, &u.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_delete(struct bch_fs *c, u64 nr) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); ++ if (ret) ++ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++typedef int (*perf_test_fn)(struct bch_fs *, u64); ++ ++struct test_job { ++ struct bch_fs *c; ++ u64 nr; ++ unsigned nr_threads; ++ perf_test_fn fn; ++ ++ atomic_t ready; ++ wait_queue_head_t ready_wait; ++ ++ atomic_t done; ++ struct completion done_completion; ++ ++ u64 start; ++ u64 finish; ++ int ret; ++}; ++ ++static int btree_perf_test_thread(void *data) ++{ ++ struct test_job *j = data; ++ int ret; ++ ++ if (atomic_dec_and_test(&j->ready)) { ++ wake_up(&j->ready_wait); ++ j->start = sched_clock(); ++ } else { ++ wait_event(j->ready_wait, !atomic_read(&j->ready)); ++ } ++ ++ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); ++ if (ret) { ++ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); ++ j->ret = ret; ++ } ++ ++ if (atomic_dec_and_test(&j->done)) { ++ j->finish = sched_clock(); ++ complete(&j->done_completion); ++ } ++ ++ return 0; ++} ++ ++int bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) ++{ ++ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; ++ char name_buf[20]; ++ struct printbuf nr_buf = PRINTBUF; ++ struct printbuf per_sec_buf = PRINTBUF; ++ unsigned i; ++ u64 time; ++ ++ atomic_set(&j.ready, nr_threads); ++ init_waitqueue_head(&j.ready_wait); ++ ++ atomic_set(&j.done, nr_threads); ++ init_completion(&j.done_completion); ++ ++#define perf_test(_test) \ ++ if (!strcmp(testname, #_test)) j.fn = _test ++ ++ perf_test(rand_insert); ++ perf_test(rand_insert_multi); ++ perf_test(rand_lookup); ++ perf_test(rand_mixed); ++ perf_test(rand_delete); ++ ++ perf_test(seq_insert); ++ perf_test(seq_lookup); ++ perf_test(seq_overwrite); ++ perf_test(seq_delete); ++ ++ /* a unit test, not a perf test: */ ++ perf_test(test_delete); ++ perf_test(test_delete_written); ++ perf_test(test_iterate); ++ perf_test(test_iterate_extents); ++ perf_test(test_iterate_slots); ++ perf_test(test_iterate_slots_extents); ++ perf_test(test_peek_end); ++ perf_test(test_peek_end_extents); ++ ++ perf_test(test_extent_overwrite_front); ++ perf_test(test_extent_overwrite_back); ++ perf_test(test_extent_overwrite_middle); ++ 
perf_test(test_extent_overwrite_all);
++
++	perf_test(test_snapshots);
++
++	if (!j.fn) {
++		pr_err("unknown test %s", testname);
++		return -EINVAL;
++	}
++
++	//pr_info("running test %s:", testname);
++
++	if (nr_threads == 1)
++		btree_perf_test_thread(&j);
++	else
++		for (i = 0; i < nr_threads; i++)
++			kthread_run(btree_perf_test_thread, &j,
++				    "bcachefs perf test[%u]", i);
++
++	while (wait_for_completion_interruptible(&j.done_completion))
++		;
++
++	time = j.finish - j.start;
++
++	scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
++	prt_human_readable_u64(&nr_buf, nr);
++	prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
++	printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
++		name_buf, nr_buf.buf, nr_threads,
++		div_u64(time, NSEC_PER_SEC),
++		div_u64(time * nr_threads, nr),
++		per_sec_buf.buf);
++	printbuf_exit(&per_sec_buf);
++	printbuf_exit(&nr_buf);
++	return j.ret;
++}
++
++#endif /* CONFIG_BCACHEFS_TESTS */
+diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
+new file mode 100644
+index 000000000000..c73b18aea7e0
+--- /dev/null
++++ b/fs/bcachefs/tests.h
+@@ -0,0 +1,15 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _BCACHEFS_TEST_H
++#define _BCACHEFS_TEST_H
++
++struct bch_fs;
++
++#ifdef CONFIG_BCACHEFS_TESTS
++
++int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
++
++#else
++
++#endif /* CONFIG_BCACHEFS_TESTS */
++
++#endif /* _BCACHEFS_TEST_H */
+diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
+new file mode 100644
+index 000000000000..59e8dfa3d245
+--- /dev/null
++++ b/fs/bcachefs/trace.c
+@@ -0,0 +1,12 @@
++// SPDX-License-Identifier: GPL-2.0
++#include "bcachefs.h"
++#include "alloc_types.h"
++#include "buckets.h"
++#include "btree_types.h"
++#include "keylist.h"
++
++#include
++#include "keylist.h"
++
++#define CREATE_TRACE_POINTS
++#include
+diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
+new file mode 100644
+index 000000000000..ee2c7d9e7050
+--- /dev/null
++++ b/fs/bcachefs/util.c
+@@ -0,0 +1,964 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * random utility code, for bcache but in theory not specific to bcache
++ *
++ * Copyright 2010, 2011 Kent Overstreet
++ * Copyright 2012 Google, Inc.
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "eytzinger.h" ++#include "util.h" ++ ++static const char si_units[] = "?kMGTPEZY"; ++ ++/* string_get_size units: */ ++static const char *const units_2[] = { ++ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" ++}; ++static const char *const units_10[] = { ++ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" ++}; ++ ++static int parse_u64(const char *cp, u64 *res) ++{ ++ const char *start = cp; ++ u64 v = 0; ++ ++ if (!isdigit(*cp)) ++ return -EINVAL; ++ ++ do { ++ if (v > U64_MAX / 10) ++ return -ERANGE; ++ v *= 10; ++ if (v > U64_MAX - (*cp - '0')) ++ return -ERANGE; ++ v += *cp - '0'; ++ cp++; ++ } while (isdigit(*cp)); ++ ++ *res = v; ++ return cp - start; ++} ++ ++static int bch2_pow(u64 n, u64 p, u64 *res) ++{ ++ *res = 1; ++ ++ while (p--) { ++ if (*res > div_u64(U64_MAX, n)) ++ return -ERANGE; ++ *res *= n; ++ } ++ return 0; ++} ++ ++static int parse_unit_suffix(const char *cp, u64 *res) ++{ ++ const char *start = cp; ++ u64 base = 1024; ++ unsigned u; ++ int ret; ++ ++ if (*cp == ' ') ++ cp++; ++ ++ for (u = 1; u < strlen(si_units); u++) ++ if (*cp == si_units[u]) { ++ cp++; ++ goto got_unit; ++ } ++ ++ for (u = 0; u < ARRAY_SIZE(units_2); u++) ++ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { ++ cp += strlen(units_2[u]); ++ goto got_unit; ++ } ++ ++ for (u = 0; u < ARRAY_SIZE(units_10); u++) ++ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { ++ cp += strlen(units_10[u]); ++ base = 1000; ++ goto got_unit; ++ } ++ ++ *res = 1; ++ return 0; ++got_unit: ++ ret = bch2_pow(base, u, res); ++ if (ret) ++ return ret; ++ ++ return cp - start; ++} ++ ++#define parse_or_ret(cp, _f) \ ++do { \ ++ int ret = _f; \ ++ if (ret < 0) \ ++ return ret; \ ++ cp += ret; \ ++} while (0) ++ ++static int __bch2_strtou64_h(const char *cp, u64 *res) ++{ ++ const char *start = cp; ++ u64 v = 0, b, f_n = 0, f_d = 1; ++ int ret; ++ ++ parse_or_ret(cp, parse_u64(cp, &v)); ++ ++ if (*cp == '.') { ++ cp++; ++ ret = parse_u64(cp, &f_n); ++ if (ret < 0) ++ return ret; ++ cp += ret; ++ ++ ret = bch2_pow(10, ret, &f_d); ++ if (ret) ++ return ret; ++ } ++ ++ parse_or_ret(cp, parse_unit_suffix(cp, &b)); ++ ++ if (v > div_u64(U64_MAX, b)) ++ return -ERANGE; ++ v *= b; ++ ++ if (f_n > div_u64(U64_MAX, b)) ++ return -ERANGE; ++ ++ f_n = div_u64(f_n * b, f_d); ++ if (v + f_n < v) ++ return -ERANGE; ++ v += f_n; ++ ++ *res = v; ++ return cp - start; ++} ++ ++static int __bch2_strtoh(const char *cp, u64 *res, ++ u64 t_max, bool t_signed) ++{ ++ bool positive = *cp != '-'; ++ u64 v = 0; ++ ++ if (*cp == '+' || *cp == '-') ++ cp++; ++ ++ parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); ++ ++ if (*cp == '\n') ++ cp++; ++ if (*cp) ++ return -EINVAL; ++ ++ if (positive) { ++ if (v > t_max) ++ return -ERANGE; ++ } else { ++ if (v && !t_signed) ++ return -ERANGE; ++ ++ if (v > t_max + 1) ++ return -ERANGE; ++ v = -v; ++ } ++ ++ *res = v; ++ return 0; ++} ++ ++#define STRTO_H(name, type) \ ++int bch2_ ## name ## _h(const char *cp, type *res) \ ++{ \ ++ u64 v = 0; \ ++ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ++ ANYSINT_MAX(type) != ((type) ~0ULL)); \ ++ *res = v; \ ++ return ret; \ ++} ++ ++STRTO_H(strtoint, int) ++STRTO_H(strtouint, unsigned int) ++STRTO_H(strtoll, long long) ++STRTO_H(strtoull, unsigned long long) ++STRTO_H(strtou64, u64) ++ ++u64 bch2_read_flag_list(char *opt, const char * const list[]) ++{ ++ u64 
ret = 0; ++ char *p, *s, *d = kstrdup(opt, GFP_KERNEL); ++ ++ if (!d) ++ return -ENOMEM; ++ ++ s = strim(d); ++ ++ while ((p = strsep(&s, ","))) { ++ int flag = match_string(list, -1, p); ++ if (flag < 0) { ++ ret = -1; ++ break; ++ } ++ ++ ret |= 1 << flag; ++ } ++ ++ kfree(d); ++ ++ return ret; ++} ++ ++bool bch2_is_zero(const void *_p, size_t n) ++{ ++ const char *p = _p; ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ if (p[i]) ++ return false; ++ return true; ++} ++ ++static void bch2_quantiles_update(struct quantiles *q, u64 v) ++{ ++ unsigned i = 0; ++ ++ while (i < ARRAY_SIZE(q->entries)) { ++ struct quantile_entry *e = q->entries + i; ++ ++ if (unlikely(!e->step)) { ++ e->m = v; ++ e->step = max_t(unsigned, v / 2, 1024); ++ } else if (e->m > v) { ++ e->m = e->m >= e->step ++ ? e->m - e->step ++ : 0; ++ } else if (e->m < v) { ++ e->m = e->m + e->step > e->m ++ ? e->m + e->step ++ : U32_MAX; ++ } ++ ++ if ((e->m > v ? e->m - v : v - e->m) < e->step) ++ e->step = max_t(unsigned, e->step / 2, 1); ++ ++ if (v >= e->m) ++ break; ++ ++ i = eytzinger0_child(i, v > e->m); ++ } ++} ++ ++/* time stats: */ ++ ++static void bch2_time_stats_update_one(struct time_stats *stats, ++ u64 start, u64 end) ++{ ++ u64 duration, freq; ++ ++ duration = time_after64(end, start) ++ ? end - start : 0; ++ freq = time_after64(end, stats->last_event) ++ ? end - stats->last_event : 0; ++ ++ stats->count++; ++ ++ stats->average_duration = stats->average_duration ++ ? ewma_add(stats->average_duration, duration, 6) ++ : duration; ++ ++ stats->average_frequency = stats->average_frequency ++ ? ewma_add(stats->average_frequency, freq, 6) ++ : freq; ++ ++ stats->max_duration = max(stats->max_duration, duration); ++ ++ stats->last_event = end; ++ ++ bch2_quantiles_update(&stats->quantiles, duration); ++} ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) ++{ ++ unsigned long flags; ++ ++ if (!stats->buffer) { ++ spin_lock_irqsave(&stats->lock, flags); ++ bch2_time_stats_update_one(stats, start, end); ++ ++ if (stats->average_frequency < 32 && ++ stats->count > 1024) ++ stats->buffer = ++ alloc_percpu_gfp(struct time_stat_buffer, ++ GFP_ATOMIC); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ } else { ++ struct time_stat_buffer_entry *i; ++ struct time_stat_buffer *b; ++ ++ preempt_disable(); ++ b = this_cpu_ptr(stats->buffer); ++ ++ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); ++ b->entries[b->nr++] = (struct time_stat_buffer_entry) { ++ .start = start, ++ .end = end ++ }; ++ ++ if (b->nr == ARRAY_SIZE(b->entries)) { ++ spin_lock_irqsave(&stats->lock, flags); ++ for (i = b->entries; ++ i < b->entries + ARRAY_SIZE(b->entries); ++ i++) ++ bch2_time_stats_update_one(stats, i->start, i->end); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ ++ b->nr = 0; ++ } ++ ++ preempt_enable(); ++ } ++} ++ ++static const struct time_unit { ++ const char *name; ++ u32 nsecs; ++} time_units[] = { ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "sec", NSEC_PER_SEC }, ++}; ++ ++static const struct time_unit *pick_time_units(u64 ns) ++{ ++ const struct time_unit *u; ++ ++ for (u = time_units; ++ u + 1 < time_units + ARRAY_SIZE(time_units) && ++ ns >= u[1].nsecs << 1; ++ u++) ++ ; ++ ++ return u; ++} ++ ++static void pr_time_units(struct printbuf *out, u64 ns) ++{ ++ const struct time_unit *u = pick_time_units(ns); ++ ++ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) ++{ ++ const struct 
time_unit *u;
++	u64 freq = READ_ONCE(stats->average_frequency);
++	u64 q, last_q = 0;
++	int i;
++
++	prt_printf(out, "count:\t\t%llu",
++		   stats->count);
++	prt_newline(out);
++	prt_printf(out, "rate:\t\t%llu/sec",
++		   freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
++	prt_newline(out);
++
++	prt_printf(out, "frequency:\t");
++	pr_time_units(out, freq);
++
++	prt_newline(out);
++	prt_printf(out, "avg duration:\t");
++	pr_time_units(out, stats->average_duration);
++
++	prt_newline(out);
++	prt_printf(out, "max duration:\t");
++	pr_time_units(out, stats->max_duration);
++
++	i = eytzinger0_first(NR_QUANTILES);
++	u = pick_time_units(stats->quantiles.entries[i].m);
++
++	prt_newline(out);
++	prt_printf(out, "quantiles (%s):\t", u->name);
++	eytzinger0_for_each(i, NR_QUANTILES) {
++		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
++
++		q = max(stats->quantiles.entries[i].m, last_q);
++		prt_printf(out, "%llu ",
++			   div_u64(q, u->nsecs));
++		if (is_last)
++			prt_newline(out);
++		last_q = q;
++	}
++}
++
++void bch2_time_stats_exit(struct time_stats *stats)
++{
++	free_percpu(stats->buffer);
++}
++
++void bch2_time_stats_init(struct time_stats *stats)
++{
++	memset(stats, 0, sizeof(*stats));
++	spin_lock_init(&stats->lock);
++}
++
++/* ratelimit: */
++
++/**
++ * bch2_ratelimit_delay() - return how long to delay until the next time to do
++ * some work
++ *
++ * @d - the struct bch_ratelimit to update
++ *
++ * Returns the amount of time to delay by, in jiffies
++ */
++u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
++{
++	u64 now = local_clock();
++
++	return time_after64(d->next, now)
++		? nsecs_to_jiffies(d->next - now)
++		: 0;
++}
++
++/**
++ * bch2_ratelimit_increment() - increment @d by the amount of work done
++ *
++ * @d - the struct bch_ratelimit to update
++ * @done - the amount of work done, in arbitrary units
++ */
++void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
++{
++	u64 now = local_clock();
++
++	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
++
++	if (time_before64(now + NSEC_PER_SEC, d->next))
++		d->next = now + NSEC_PER_SEC;
++
++	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
++		d->next = now - NSEC_PER_SEC * 2;
++}
++
++/* pd controller: */
++
++/*
++ * Updates pd_controller. Attempts to scale input values to units per second.
++ * @target: desired value
++ * @actual: current value
++ *
++ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
++ * it makes actual go down.
++ */ ++void bch2_pd_controller_update(struct bch_pd_controller *pd, ++ s64 target, s64 actual, int sign) ++{ ++ s64 proportional, derivative, change; ++ ++ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; ++ ++ if (seconds_since_update == 0) ++ return; ++ ++ pd->last_update = jiffies; ++ ++ proportional = actual - target; ++ proportional *= seconds_since_update; ++ proportional = div_s64(proportional, pd->p_term_inverse); ++ ++ derivative = actual - pd->last_actual; ++ derivative = div_s64(derivative, seconds_since_update); ++ derivative = ewma_add(pd->smoothed_derivative, derivative, ++ (pd->d_term / seconds_since_update) ?: 1); ++ derivative = derivative * pd->d_term; ++ derivative = div_s64(derivative, pd->p_term_inverse); ++ ++ change = proportional + derivative; ++ ++ /* Don't increase rate if not keeping up */ ++ if (change > 0 && ++ pd->backpressure && ++ time_after64(local_clock(), ++ pd->rate.next + NSEC_PER_MSEC)) ++ change = 0; ++ ++ change *= (sign * -1); ++ ++ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, ++ 1, UINT_MAX); ++ ++ pd->last_actual = actual; ++ pd->last_derivative = derivative; ++ pd->last_proportional = proportional; ++ pd->last_change = change; ++ pd->last_target = target; ++} ++ ++void bch2_pd_controller_init(struct bch_pd_controller *pd) ++{ ++ pd->rate.rate = 1024; ++ pd->last_update = jiffies; ++ pd->p_term_inverse = 6000; ++ pd->d_term = 30; ++ pd->d_smooth = pd->d_term; ++ pd->backpressure = 1; ++} ++ ++void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) ++{ ++ out->tabstops[0] = 20; ++ ++ prt_printf(out, "rate:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->rate.rate); ++ prt_newline(out); ++ ++ prt_printf(out, "target:"); ++ prt_tab(out); ++ prt_human_readable_u64(out, pd->last_target); ++ prt_newline(out); ++ ++ prt_printf(out, "actual:"); ++ prt_tab(out); ++ prt_human_readable_u64(out, pd->last_actual); ++ prt_newline(out); ++ ++ prt_printf(out, "proportional:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->last_proportional); ++ prt_newline(out); ++ ++ prt_printf(out, "derivative:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->last_derivative); ++ prt_newline(out); ++ ++ prt_printf(out, "change:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->last_change); ++ prt_newline(out); ++ ++ prt_printf(out, "next io:"); ++ prt_tab(out); ++ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); ++ prt_newline(out); ++} ++ ++/* misc: */ ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t size) ++{ ++ while (size) { ++ struct page *page = is_vmalloc_addr(base) ++ ? 
vmalloc_to_page(base) ++ : virt_to_page(base); ++ unsigned offset = offset_in_page(base); ++ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, offset)); ++ size -= len; ++ base += len; ++ } ++} ++ ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++{ ++ while (size) { ++ struct page *page = alloc_page(gfp_mask); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ if (unlikely(!bio_add_page(bio, page, len, 0))) { ++ __free_page(page); ++ break; ++ } ++ ++ size -= len; ++ } ++ ++ return 0; ++} ++ ++size_t bch2_rand_range(size_t max) ++{ ++ size_t rand; ++ ++ if (!max) ++ return 0; ++ ++ do { ++ rand = get_random_long(); ++ rand &= roundup_pow_of_two(max) - 1; ++ } while (rand >= max); ++ ++ return rand; ++} ++ ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, dst, iter, dst_iter) { ++ void *dstp = kmap_atomic(bv.bv_page); ++ memcpy(dstp + bv.bv_offset, src, bv.bv_len); ++ kunmap_atomic(dstp); ++ ++ src += bv.bv_len; ++ } ++} ++ ++void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, src, iter, src_iter) { ++ void *srcp = kmap_atomic(bv.bv_page); ++ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); ++ kunmap_atomic(srcp); ++ ++ dst += bv.bv_len; ++ } ++} ++ ++#include "eytzinger.h" ++ ++static int alignment_ok(const void *base, size_t align) ++{ ++ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || ++ ((unsigned long)base & (align - 1)) == 0; ++} ++ ++static void u32_swap(void *a, void *b, size_t size) ++{ ++ u32 t = *(u32 *)a; ++ *(u32 *)a = *(u32 *)b; ++ *(u32 *)b = t; ++} ++ ++static void u64_swap(void *a, void *b, size_t size) ++{ ++ u64 t = *(u64 *)a; ++ *(u64 *)a = *(u64 *)b; ++ *(u64 *)b = t; ++} ++ ++static void generic_swap(void *a, void *b, size_t size) ++{ ++ char t; ++ ++ do { ++ t = *(char *)a; ++ *(char *)a++ = *(char *)b; ++ *(char *)b++ = t; ++ } while (--size > 0); ++} ++ ++static inline int do_cmp(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ size_t l, size_t r) ++{ ++ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++static inline void do_swap(void *base, size_t n, size_t size, ++ void (*swap_func)(void *, void *, size_t), ++ size_t l, size_t r) ++{ ++ swap_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++void eytzinger0_sort(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)) ++{ ++ int i, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for (i = n / 2 - 1; i >= 0; --i) { ++ for (r = i; r * 2 + 1 < n; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < n && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - 1; i > 0; --i) { ++ do_swap(base, n, size, swap_func, 0, i); ++ ++ for (r = 0; r * 2 + 1 < i; r = c) { ++ c = r * 2 + 1; ++ ++ if (c 
+ 1 < i && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t size)) ++{ ++ /* pre-scale counters for performance */ ++ int i = (num/2 - 1) * size, n = num * size, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for ( ; i >= 0; i -= size) { ++ for (r = i; r * 2 + size < n; r = c) { ++ c = r * 2 + size; ++ if (c < n - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - size; i > 0; i -= size) { ++ swap_func(base, base + i, size); ++ for (r = 0; r * 2 + size < i; r = c) { ++ c = r * 2 + size; ++ if (c < i - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++} ++ ++static void mempool_free_vp(void *element, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ vpfree(element, size); ++} ++ ++static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ return vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) ++{ ++ return size < PAGE_SIZE ++ ? mempool_init_kmalloc_pool(pool, min_nr, size) ++ : mempool_init(pool, min_nr, mempool_alloc_vp, ++ mempool_free_vp, (void *) size); ++} ++ ++#if 0 ++void eytzinger1_test(void) ++{ ++ unsigned inorder, eytz, size; ++ ++ pr_info("1 based eytzinger test:"); ++ ++ for (size = 2; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger1_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); ++ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); ++ ++ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); ++ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ ++ inorder = 1; ++ eytzinger1_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger1_last(size) && ++ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++void eytzinger0_test(void) ++{ ++ ++ unsigned inorder, eytz, size; ++ ++ pr_info("0 based eytzinger test:"); ++ ++ for (size = 1; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger0_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); ++ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); ++ ++ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); ++ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ ++ inorder = 0; ++ eytzinger0_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger0_last(size) && ++ eytzinger0_prev(eytzinger0_next(eytz, size), size) != 
eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++{ ++ const u16 *l = _l, *r = _r; ++ ++ return (*l > *r) - (*r - *l); ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ int i, c1 = -1, c2 = -1; ++ ssize_t r; ++ ++ r = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) ++ c1 = test_array[r]; ++ ++ for (i = 0; i < nr; i++) ++ if (test_array[i] <= search && test_array[i] > c2) ++ c2 = test_array[i]; ++ ++ if (c1 != c2) { ++ eytzinger0_for_each(i, nr) ++ pr_info("[%3u] = %12u", i, test_array[i]); ++ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", ++ i, r, c1, c2); ++ } ++} ++ ++void eytzinger0_find_test(void) ++{ ++ unsigned i, nr, allocated = 1 << 12; ++ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); ++ ++ for (nr = 1; nr < allocated; nr++) { ++ pr_info("testing %u elems", nr); ++ ++ get_random_bytes(test_array, nr * sizeof(test_array[0])); ++ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); ++ ++ /* verify array is sorted correctly: */ ++ eytzinger0_for_each(i, nr) ++ BUG_ON(i != eytzinger0_last(nr) && ++ test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ ++ for (i = 0; i < U16_MAX; i += 1 << 12) ++ eytzinger0_find_test_val(test_array, nr, i); ++ ++ for (i = 0; i < nr; i++) { ++ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); ++ eytzinger0_find_test_val(test_array, nr, test_array[i]); ++ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); ++ } ++ } ++ ++ kfree(test_array); ++} ++#endif ++ ++/* ++ * Accumulate percpu counters onto one cpu's copy - only valid when access ++ * against any percpu counter is guarded against ++ */ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) ++{ ++ u64 *ret; ++ int cpu; ++ ++ /* access to pcpu vars has to be blocked by other locking */ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ ++ for_each_possible_cpu(cpu) { ++ u64 *i = per_cpu_ptr(p, cpu); ++ ++ if (i != ret) { ++ acc_u64s(ret, i, nr); ++ memset(i, 0, nr * sizeof(u64)); ++ } ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +new file mode 100644 +index 000000000000..1fe66fd91ccc +--- /dev/null ++++ b/fs/bcachefs/util.h +@@ -0,0 +1,783 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_UTIL_H ++#define _BCACHEFS_UTIL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct closure; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define EBUG_ON(cond) BUG_ON(cond) ++#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) ++#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) ++#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) ++#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) ++#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) ++#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) ++#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) ++#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) ++#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) ++#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) ++ ++#else /* DEBUG */ ++ ++#define EBUG_ON(cond) ++#define atomic_dec_bug(v) atomic_dec(v) ++#define atomic_inc_bug(v, i) 
atomic_inc(v) ++#define atomic_sub_bug(i, v) atomic_sub(i, v) ++#define atomic_add_bug(i, v) atomic_add(i, v) ++#define atomic_long_dec_bug(v) atomic_long_dec(v) ++#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) ++#define atomic64_dec_bug(v) atomic64_dec(v) ++#define atomic64_inc_bug(v, i) atomic64_inc(v) ++#define atomic64_sub_bug(i, v) atomic64_sub(i, v) ++#define atomic64_add_bug(i, v) atomic64_add(i, v) ++ ++#endif ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++#define CPU_BIG_ENDIAN 0 ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++#define CPU_BIG_ENDIAN 1 ++#endif ++ ++/* type hackery */ ++ ++#define type_is_exact(_val, _type) \ ++ __builtin_types_compatible_p(typeof(_val), _type) ++ ++#define type_is(_val, _type) \ ++ (__builtin_types_compatible_p(typeof(_val), _type) || \ ++ __builtin_types_compatible_p(typeof(_val), const _type)) ++ ++/* Userspace doesn't align allocations as nicely as the kernel allocators: */ ++static inline size_t buf_pages(void *p, size_t len) ++{ ++ return DIV_ROUND_UP(len + ++ ((unsigned long) p & (PAGE_SIZE - 1)), ++ PAGE_SIZE); ++} ++ ++static inline void vpfree(void *p, size_t size) ++{ ++ if (is_vmalloc_addr(p)) ++ vfree(p); ++ else ++ free_pages((unsigned long) p, get_order(size)); ++} ++ ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); ++} ++ ++static inline void kvpfree(void *p, size_t size) ++{ ++ if (size < PAGE_SIZE) ++ kfree(p); ++ else ++ vpfree(p, size); ++} ++ ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return size < PAGE_SIZE ++ ? kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); ++ ++#define HEAP(type) \ ++struct { \ ++ size_t size, used; \ ++ type *data; \ ++} ++ ++#define DECLARE_HEAP(type, name) HEAP(type) name ++ ++#define init_heap(heap, _size, gfp) \ ++({ \ ++ (heap)->used = 0; \ ++ (heap)->size = (_size); \ ++ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ ++ (gfp)); \ ++}) ++ ++#define free_heap(heap) \ ++do { \ ++ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ ++ (heap)->data = NULL; \ ++} while (0) ++ ++#define heap_set_backpointer(h, i, _fn) \ ++do { \ ++ void (*fn)(typeof(h), size_t) = _fn; \ ++ if (fn) \ ++ fn(h, i); \ ++} while (0) ++ ++#define heap_swap(h, i, j, set_backpointer) \ ++do { \ ++ swap((h)->data[i], (h)->data[j]); \ ++ heap_set_backpointer(h, i, set_backpointer); \ ++ heap_set_backpointer(h, j, set_backpointer); \ ++} while (0) ++ ++#define heap_peek(h) \ ++({ \ ++ EBUG_ON(!(h)->used); \ ++ (h)->data[0]; \ ++}) ++ ++#define heap_full(h) ((h)->used == (h)->size) ++ ++#define heap_sift_down(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _c, _j = i; \ ++ \ ++ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ ++ _c = _j * 2 + 1; \ ++ if (_c + 1 < (h)->used && \ ++ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ ++ _c++; \ ++ \ ++ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ ++ break; \ ++ heap_swap(h, _c, _j, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_sift_up(h, i, cmp, set_backpointer) \ ++do { \ ++ while (i) { \ ++ size_t p = (i - 1) / 2; \ ++ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ ++ break; \ ++ heap_swap(h, i, p, set_backpointer); \ ++ i = p; \ ++ } \ ++} while (0) ++ ++#define __heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ size_t _i = (h)->used++; \ ++ (h)->data[_i] = d; \ ++ heap_set_backpointer(h, _i, 
set_backpointer); \ ++ \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ _i; \ ++}) ++ ++#define heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = !heap_full(h); \ ++ if (_r) \ ++ __heap_add(h, d, cmp, set_backpointer); \ ++ _r; \ ++}) ++ ++#define heap_add_or_replace(h, new, cmp, set_backpointer) \ ++do { \ ++ if (!heap_add(h, new, cmp, set_backpointer) && \ ++ cmp(h, new, heap_peek(h)) >= 0) { \ ++ (h)->data[0] = new; \ ++ heap_set_backpointer(h, 0, set_backpointer); \ ++ heap_sift_down(h, 0, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_del(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _i = (i); \ ++ \ ++ BUG_ON(_i >= (h)->used); \ ++ (h)->used--; \ ++ if ((_i) < (h)->used) { \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_pop(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = (h)->used; \ ++ if (_r) { \ ++ (d) = (h)->data[0]; \ ++ heap_del(h, 0, cmp, set_backpointer); \ ++ } \ ++ _r; \ ++}) ++ ++#define heap_resort(heap, cmp, set_backpointer) \ ++do { \ ++ ssize_t _i; \ ++ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ ++ heap_sift_down(heap, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define ANYSINT_MAX(t) \ ++ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) ++ ++ ++#ifdef __KERNEL__ ++static inline void pr_time(struct printbuf *out, u64 time) ++{ ++ prt_printf(out, "%llu", time); ++} ++#else ++#include ++static inline void pr_time(struct printbuf *out, u64 _time) ++{ ++ char time_str[64]; ++ time_t time = _time; ++ struct tm *tm = localtime(&time); ++ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); ++ if (!err) ++ prt_printf(out, "(formatting error)"); ++ else ++ prt_printf(out, "%s", time_str); ++} ++#endif ++ ++#ifdef __KERNEL__ ++static inline void uuid_unparse_lower(u8 *uuid, char *out) ++{ ++ sprintf(out, "%pUb", uuid); ++} ++#else ++#include ++#endif ++ ++static inline void pr_uuid(struct printbuf *out, u8 *uuid) ++{ ++ char uuid_str[40]; ++ ++ uuid_unparse_lower(uuid, uuid_str); ++ prt_printf(out, "%s", uuid_str); ++} ++ ++int bch2_strtoint_h(const char *, int *); ++int bch2_strtouint_h(const char *, unsigned int *); ++int bch2_strtoll_h(const char *, long long *); ++int bch2_strtoull_h(const char *, unsigned long long *); ++int bch2_strtou64_h(const char *, u64 *); ++ ++static inline int bch2_strtol_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtoint_h(cp, (int *) res); ++#else ++ return bch2_strtoll_h(cp, (long long *) res); ++#endif ++} ++ ++static inline int bch2_strtoul_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtouint_h(cp, (unsigned int *) res); ++#else ++ return bch2_strtoull_h(cp, (unsigned long long *) res); ++#endif ++} ++ ++#define strtoi_h(cp, res) \ ++ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ ++ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ ++ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ ++ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ ++ : -EINVAL) ++ ++#define strtoul_safe(cp, var) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = _v; \ ++ _r; \ ++}) ++ ++#define strtoul_safe_clamp(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = clamp_t(typeof(var), _v, min, max); \ ++ _r; \ ++}) ++ ++#define strtoul_safe_restrict(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r && _v >= min && _v <= max) \ ++ var = _v; \ ++ else \ ++ _r = -EINVAL; \ ++ _r; \ ++}) ++ ++#define snprint(out, var) \ ++ prt_printf(out, \ ++ type_is(var, int) ? "%i\n" \ ++ : type_is(var, unsigned) ? "%u\n" \ ++ : type_is(var, long) ? "%li\n" \ ++ : type_is(var, unsigned long) ? "%lu\n" \ ++ : type_is(var, s64) ? "%lli\n" \ ++ : type_is(var, u64) ? "%llu\n" \ ++ : type_is(var, char *) ? "%s\n" \ ++ : "%i\n", var) ++ ++bool bch2_is_zero(const void *, size_t); ++ ++u64 bch2_read_flag_list(char *, const char * const[]); ++ ++#define NR_QUANTILES 15 ++#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) ++#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) ++#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) ++ ++struct quantiles { ++ struct quantile_entry { ++ u64 m; ++ u64 step; ++ } entries[NR_QUANTILES]; ++}; ++ ++struct time_stat_buffer { ++ unsigned nr; ++ struct time_stat_buffer_entry { ++ u64 start; ++ u64 end; ++ } entries[32]; ++}; ++ ++struct time_stats { ++ spinlock_t lock; ++ u64 count; ++ /* all fields are in nanoseconds */ ++ u64 average_duration; ++ u64 average_frequency; ++ u64 max_duration; ++ u64 last_event; ++ struct quantiles quantiles; ++ ++ struct time_stat_buffer __percpu *buffer; ++}; ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64, u64); ++ ++static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) ++{ ++ __bch2_time_stats_update(stats, start, local_clock()); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); ++ ++void bch2_time_stats_exit(struct time_stats *); ++void bch2_time_stats_init(struct time_stats *); ++ ++#define ewma_add(ewma, val, weight) \ ++({ \ ++ typeof(ewma) _ewma = (ewma); \ ++ typeof(weight) _weight = (weight); \ ++ \ ++ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ ++}) ++ ++struct bch_ratelimit { ++ /* Next time we want to do some work, in nanoseconds */ ++ u64 next; ++ ++ /* ++ * Rate at which we want to do work, in units per nanosecond ++ * The units here correspond to the units passed to ++ * bch2_ratelimit_increment() ++ */ ++ unsigned rate; ++}; ++ ++static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) ++{ ++ d->next = local_clock(); ++} ++ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *); ++void bch2_ratelimit_increment(struct bch_ratelimit *, u64); ++ ++struct bch_pd_controller { ++ struct bch_ratelimit rate; ++ unsigned long last_update; ++ ++ s64 last_actual; ++ s64 smoothed_derivative; ++ ++ unsigned p_term_inverse; ++ unsigned d_smooth; ++ unsigned d_term; ++ ++ /* for exporting to sysfs (no effect on behavior) */ ++ s64 last_derivative; ++ s64 last_proportional; ++ s64 last_change; ++ s64 last_target; ++ ++ /* If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. 
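/*
 * Aside, not part of the patch: ewma_add() above maintains an exponentially
 * weighted moving average using only shifts and adds; the update it computes
 * is new = ((old << w) - old + val) >> w, i.e. old + (val - old) / 2^w.
 * A minimal standalone sketch of the same arithmetic follows; demo_ewma_add
 * and the sample values are invented for illustration only.
 */
#include <stdio.h>

static unsigned long demo_ewma_add(unsigned long ewma, unsigned long val,
				   unsigned weight)
{
	/* same shift/subtract form as the ewma_add() macro above */
	return (((ewma << weight) - ewma) + val) >> weight;
}

int main(void)
{
	unsigned long avg = 0;
	unsigned i;

	/* with weight 3, each sample moves the average about 1/8 of the way */
	for (i = 0; i < 32; i++)
		avg = demo_ewma_add(avg, 1000, 3);
	printf("average after 32 samples of 1000: %lu\n", avg);
	return 0;
}
/* Build with: cc -o ewma_demo ewma_demo.c */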
*/ ++ bool backpressure; ++}; ++ ++void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); ++void bch2_pd_controller_init(struct bch_pd_controller *); ++void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); ++ ++#define sysfs_pd_controller_attribute(name) \ ++ rw_attribute(name##_rate); \ ++ rw_attribute(name##_rate_bytes); \ ++ rw_attribute(name##_rate_d_term); \ ++ rw_attribute(name##_rate_p_term_inverse); \ ++ read_attribute(name##_rate_debug) ++ ++#define sysfs_pd_controller_files(name) \ ++ &sysfs_##name##_rate, \ ++ &sysfs_##name##_rate_bytes, \ ++ &sysfs_##name##_rate_d_term, \ ++ &sysfs_##name##_rate_p_term_inverse, \ ++ &sysfs_##name##_rate_debug ++ ++#define sysfs_pd_controller_show(name, var) \ ++do { \ ++ sysfs_hprint(name##_rate, (var)->rate.rate); \ ++ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ ++ sysfs_print(name##_rate_d_term, (var)->d_term); \ ++ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ ++ \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ bch2_pd_controller_debug_to_text(out, var); \ ++} while (0) ++ ++#define sysfs_pd_controller_store(name, var) \ ++do { \ ++ sysfs_strtoul_clamp(name##_rate, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul_clamp(name##_rate_bytes, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ ++ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ ++ (var)->p_term_inverse, 1, INT_MAX); \ ++} while (0) ++ ++#define container_of_or_null(ptr, type, member) \ ++({ \ ++ typeof(ptr) _ptr = ptr; \ ++ _ptr ? container_of(_ptr, type, member) : NULL; \ ++}) ++ ++/* Does linear interpolation between powers of two */ ++static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) ++{ ++ unsigned fract = x & ~(~0 << fract_bits); ++ ++ x >>= fract_bits; ++ x = 1 << x; ++ x += (x * fract) >> fract_bits; ++ ++ return x; ++} ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t); ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++ ++static inline sector_t bdev_sectors(struct block_device *bdev) ++{ ++ return bdev->bd_inode->i_size >> 9; ++} ++ ++#define closure_bio_submit(bio, cl) \ ++do { \ ++ closure_get(cl); \ ++ submit_bio(bio); \ ++} while (0) ++ ++#define kthread_wait_freezable(cond) \ ++({ \ ++ int _ret = 0; \ ++ while (1) { \ ++ set_current_state(TASK_INTERRUPTIBLE); \ ++ if (kthread_should_stop()) { \ ++ _ret = -1; \ ++ break; \ ++ } \ ++ \ ++ if (cond) \ ++ break; \ ++ \ ++ schedule(); \ ++ try_to_freeze(); \ ++ } \ ++ set_current_state(TASK_RUNNING); \ ++ _ret; \ ++}) ++ ++size_t bch2_rand_range(size_t); ++ ++void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); ++void memcpy_from_bio(void *, struct bio *, struct bvec_iter); ++ ++static inline void memcpy_u64s_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++} ++ ++static inline void __memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("rep ; movsq" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++#endif ++} ++ ++static inline void memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || ++ dst + u64s * sizeof(u64) <= src)); ++ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void 
__memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst > src); ++ ++ __memmove_u64s_down(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up_small(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s; ++ u64 *src = (u64 *) _src + u64s; ++ ++ while (u64s--) ++ *--dst = *--src; ++} ++ ++static inline void memmove_u64s_up_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up_small(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s - 1; ++ u64 *src = (u64 *) _src + u64s - 1; ++ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("std ;\n" ++ "rep ; movsq\n" ++ "cld ;\n" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ while (u64s--) ++ *dst-- = *src--; ++#endif ++} ++ ++static inline void memmove_u64s_up(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++static inline void memmove_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ if (dst < src) ++ __memmove_u64s_down(dst, src, u64s); ++ else ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ ++static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ++{ ++ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; ++ ++ memset(s + bytes, c, rem); ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++/* just the memmove, doesn't update @_nr */ ++#define __array_insert_item(_array, _nr, _pos) \ ++ memmove(&(_array)[(_pos) + 1], \ ++ &(_array)[(_pos)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))) ++ ++#define array_insert_item(_array, _nr, _pos, _new_item) \ ++do { \ ++ __array_insert_item(_array, _nr, _pos); \ ++ (_nr)++; \ ++ (_array)[(_pos)] = (_new_item); \ ++} while (0) ++ ++#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ ++do { \ ++ (_nr) -= (_nr_to_remove); \ ++ memmove(&(_array)[(_pos)], \ ++ &(_array)[(_pos) + (_nr_to_remove)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))); \ ++} while (0) ++ ++#define array_remove_item(_array, _nr, _pos) \ ++ array_remove_items(_array, _nr, _pos, 1) ++ ++static inline void __move_gap(void *array, size_t element_size, ++ size_t nr, size_t size, ++ size_t old_gap, size_t new_gap) ++{ ++ size_t gap_end = old_gap + size - nr; ++ ++ if (new_gap < old_gap) { ++ size_t move = old_gap - new_gap; ++ ++ memmove(array + element_size * (gap_end - move), ++ array + element_size * (old_gap - move), ++ element_size * move); ++ } else if (new_gap > old_gap) { ++ size_t move = new_gap - old_gap; ++ ++ memmove(array + element_size * old_gap, ++ array + element_size * gap_end, ++ element_size * move); ++ } ++} ++ ++/* Move the gap in a gap buffer: */ ++#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ ++ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) ++ ++#define bubble_sort(_base, _nr, _cmp) \ ++do { \ ++ ssize_t _i, _end; \ ++ bool _swapped = true; \ ++ \ ++ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ ++ _swapped = false; \ ++ for (_i = 0; _i < _end; _i++) \ ++ if 
(_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ ++ swap((_base)[_i], (_base)[_i + 1]); \ ++ _swapped = true; \ ++ } \ ++ } \ ++} while (0) ++ ++static inline u64 percpu_u64_get(u64 __percpu *src) ++{ ++ u64 ret = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ ret += *per_cpu_ptr(src, cpu); ++ return ret; ++} ++ ++static inline void percpu_u64_set(u64 __percpu *dst, u64 src) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(dst, cpu) = 0; ++ this_cpu_write(*dst, src); ++} ++ ++static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < nr; i++) ++ acc[i] += src[i]; ++} ++ ++static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, ++ unsigned nr) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); ++} ++ ++static inline void percpu_memset(void __percpu *p, int c, size_t bytes) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(p, cpu), c, bytes); ++} ++ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); ++ ++#define cmp_int(l, r) ((l > r) - (l < r)) ++ ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ ++#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +new file mode 100644 +index 000000000000..5143b603bf67 +--- /dev/null ++++ b/fs/bcachefs/varint.c +@@ -0,0 +1,121 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_VALGRIND ++#include ++#endif ++ ++#include "varint.h" ++ ++/** ++ * bch2_varint_encode - encode a variable length integer ++ * @out - destination to encode to ++ * @v - unsigned integer to encode ++ * ++ * Returns the size in bytes of the encoded integer - at most 9 bytes ++ */ ++int bch2_varint_encode(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ v = cpu_to_le64(v); ++ memcpy(out, &v, bytes); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ put_unaligned_le64(v, out); ++ } ++ ++ return bytes; ++} ++ ++/** ++ * bch2_varint_decode - encode a variable length integer ++ * @in - varint to decode ++ * @end - end of buffer to decode from ++ * @out - on success, decoded integer ++ * ++ * Returns the size in bytes of the decoded integer - or -1 on failure (would ++ * have read past the end of the buffer) ++ */ ++int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) ++{ ++ unsigned bytes = likely(in < end) ++ ? ffz(*in & 255) + 1 ++ : 1; ++ u64 v; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v = 0; ++ memcpy(&v, in, bytes); ++ v = le64_to_cpu(v); ++ v >>= bytes; ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} ++ ++/** ++ * bch2_varint_encode_fast - fast version of bch2_varint_encode ++ * ++ * This version assumes it's always safe to write 8 bytes to @out, even if the ++ * encoded integer would be smaller. 
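/*
 * Aside, not part of the patch: the varint layout implemented by
 * bch2_varint_encode()/bch2_varint_decode() above stores the encoded length
 * as the number of trailing 1-bits in the first byte (plus one), and falls
 * back to a 0xff prefix followed by the raw little-endian u64 when 9 bytes
 * are needed. The standalone round-trip below mirrors that layout for
 * illustration; demo_varint_encode/demo_varint_decode and the local
 * bit-length helper are invented names, not the kernel helpers.
 */
#include <assert.h>
#include <stdint.h>

static unsigned demo_fls64(uint64_t v)
{
	unsigned r = 0;

	while (v) {
		r++;
		v >>= 1;
	}
	return r;
}

static int demo_varint_encode(uint8_t *out, uint64_t v)
{
	unsigned bits = demo_fls64(v | 1);
	unsigned bytes = (bits + 6) / 7;
	unsigned i;

	if (bytes < 9) {
		v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
		for (i = 0; i < bytes; i++)
			out[i] = v >> (8 * i);
	} else {
		out[0] = 255;
		bytes = 9;
		for (i = 0; i < 8; i++)
			out[1 + i] = v >> (8 * i);
	}
	return bytes;
}

static int demo_varint_decode(const uint8_t *in, uint64_t *out)
{
	unsigned bytes = 1, i;
	uint64_t v = 0;

	while (bytes <= 8 && (in[0] & (1u << (bytes - 1))))
		bytes++;

	if (bytes < 9) {
		for (i = 0; i < bytes; i++)
			v |= (uint64_t) in[i] << (8 * i);
		v >>= bytes;
	} else {
		for (i = 0; i < 8; i++)
			v |= (uint64_t) in[1 + i] << (8 * i);
	}
	*out = v;
	return bytes;
}

int main(void)
{
	uint64_t vals[] = { 0, 127, 128, (1ULL << 56) - 1, 1ULL << 56, ~0ULL };
	unsigned i;

	for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		uint8_t buf[9];
		uint64_t got;
		int n = demo_varint_encode(buf, vals[i]);

		assert(demo_varint_decode(buf, &got) == n);
		assert(got == vals[i]);
	}
	return 0;
}
/* e.g. 127 encodes in 1 byte, 128 in 2, and 1 << 56 needs the 9-byte form. */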
++ */ ++int bch2_varint_encode_fast(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ } ++ ++ put_unaligned_le64(v, out); ++ return bytes; ++} ++ ++/** ++ * bch2_varint_decode_fast - fast version of bch2_varint_decode ++ * ++ * This version assumes that it is safe to read at most 8 bytes past the end of ++ * @end (we still return an error if the varint extends past @end). ++ */ ++int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) ++{ ++#ifdef CONFIG_VALGRIND ++ VALGRIND_MAKE_MEM_DEFINED(in, 8); ++#endif ++ u64 v = get_unaligned_le64(in); ++ unsigned bytes = ffz(*in) + 1; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v >>= bytes; ++ v &= ~(~0ULL << (7 * bytes)); ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} +diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h +new file mode 100644 +index 000000000000..92a182fb3d7a +--- /dev/null ++++ b/fs/bcachefs/varint.h +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_VARINT_H ++#define _BCACHEFS_VARINT_H ++ ++int bch2_varint_encode(u8 *, u64); ++int bch2_varint_decode(const u8 *, const u8 *, u64 *); ++ ++int bch2_varint_encode_fast(u8 *, u64); ++int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); ++ ++#endif /* _BCACHEFS_VARINT_H */ +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +new file mode 100644 +index 000000000000..53a694d71967 +--- /dev/null ++++ b/fs/bcachefs/vstructs.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _VSTRUCTS_H ++#define _VSTRUCTS_H ++ ++#include "util.h" ++ ++/* ++ * NOTE: we can't differentiate between __le64 and u64 with type_is - this ++ * assumes u64 is little endian: ++ */ ++#define __vstruct_u64s(_s) \ ++({ \ ++ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ ++ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ ++ : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ ++ : ((__force u8) ((_s)->u64s))); \ ++}) ++ ++#define __vstruct_bytes(_type, _u64s) \ ++({ \ ++ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ ++ \ ++ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++}) ++ ++#define vstruct_bytes(_s) \ ++ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) ++ ++#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ ++ (round_up(__vstruct_bytes(_type, _u64s), \ ++ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) ++ ++#define vstruct_blocks(_s, _sector_block_bits) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) ++ ++#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ ++ __vstruct_u64s(_s) + (_u64s)) ++ ++#define vstruct_sectors(_s, _sector_block_bits) \ ++ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) ++ ++#define vstruct_next(_s) \ ++ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_last(_s) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_end(_s) \ ++ ((void *) ((_s)->_data + __vstruct_u64s(_s))) ++ ++#define vstruct_for_each(_s, _i) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s); \ ++ _i = vstruct_next(_i)) ++ ++#define vstruct_for_each_safe(_s, _i, _t) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ ++ _i = _t) ++ ++#define vstruct_idx(_s, _idx) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) ++ ++#endif /* _VSTRUCTS_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +new file mode 100644 +index 000000000000..186ffab542d5 +--- /dev/null ++++ b/fs/bcachefs/xattr.c +@@ -0,0 +1,648 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "fs.h" ++#include "rebalance.h" ++#include "str_hash.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); ++ ++static u64 bch2_xattr_hash(const struct bch_hash_info *info, ++ const struct xattr_search_key *key) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); ++ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); ++ ++ return bch2_str_hash_end(&ctx, info); ++} ++ ++static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_xattr_hash(info, key); ++} ++ ++static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); ++ ++ return bch2_xattr_hash(info, ++ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); ++} ++ ++static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ const struct xattr_search_key *r = _r; ++ ++ return l.v->x_type != r->type || ++ l.v->x_name_len != r->name.len || ++ memcmp(l.v->x_name, r->name.name, r->name.len); ++} ++ ++static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); ++ ++ return l.v->x_type != r.v->x_type || ++ l.v->x_name_len != r.v->x_name_len || ++ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); ++} ++ ++const struct bch_hash_desc bch2_xattr_hash_desc = { ++ .btree_id = BTREE_ID_xattrs, ++ .key_type 
= KEY_TYPE_xattr, ++ .hash_key = xattr_hash_key, ++ .hash_bkey = xattr_hash_bkey, ++ .cmp_key = xattr_cmp_key, ++ .cmp_bkey = xattr_cmp_bkey, ++}; ++ ++int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*xattr.v)); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) < ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))) { ++ prt_printf(err, "value too small (%zu < %u)", ++ bkey_val_u64s(k.k), ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))); ++ return -EINVAL; ++ } ++ ++ /* XXX why +4 ? */ ++ if (bkey_val_u64s(k.k) > ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)) { ++ prt_printf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)); ++ return -EINVAL; ++ } ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (!handler) { ++ prt_printf(err, "invalid type (%u)", xattr.v->x_type); ++ return -EINVAL; ++ } ++ ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { ++ prt_printf(err, "xattr name has invalid characters"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (handler && handler->prefix) ++ prt_printf(out, "%s", handler->prefix); ++ else if (handler) ++ prt_printf(out, "(type %u)", xattr.v->x_type); ++ else ++ prt_printf(out, "(unknown type %u)", xattr.v->x_type); ++ ++ prt_printf(out, "%.*s:%.*s", ++ xattr.v->x_name_len, ++ xattr.v->x_name, ++ le16_to_cpu(xattr.v->x_val_len), ++ (char *) xattr_val(xattr.v)); ++} ++ ++static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); ++ struct btree_iter iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, ++ inode_inum(inode), ++ &X_SEARCH(type, name, strlen(name)), ++ 0); ++ if (ret) ++ goto err1; ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err2; ++ ++ xattr = bkey_s_c_to_xattr(k); ++ ret = le16_to_cpu(xattr.v->x_val_len); ++ if (buffer) { ++ if (ret > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, xattr_val(xattr.v), ret); ++ } ++err2: ++ bch2_trans_iter_exit(trans, &iter); ++err1: ++ return ret == -ENOENT ? 
-ENODATA : ret; ++} ++ ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); ++} ++ ++int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, ++ const struct bch_hash_info *hash_info, ++ const char *name, const void *value, size_t size, ++ int type, int flags) ++{ ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ /* ++ * We need to do an inode update so that bi_journal_sync gets updated ++ * and fsync works: ++ * ++ * Perhaps we should be updating bi_mtime too? ++ */ ++ ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: ++ bch2_inode_write(trans, &inode_iter, &inode_u); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ ++ if (ret) ++ return ret; ++ ++ if (value) { ++ struct bkey_i_xattr *xattr; ++ unsigned namelen = strlen(name); ++ unsigned u64s = BKEY_U64s + ++ xattr_val_u64s(namelen, size); ++ ++ if (u64s > U8_MAX) ++ return -ERANGE; ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = type; ++ xattr->v.x_name_len = namelen; ++ xattr->v.x_val_len = cpu_to_le16(size); ++ memcpy(xattr->v.x_name, name, namelen); ++ memcpy(xattr_val(&xattr->v), value, size); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inum, &xattr->k_i, ++ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| ++ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(type, name, strlen(name)); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, ++ hash_info, inum, &search); ++ } ++ ++ if (ret == -ENOENT) ++ ret = flags & XATTR_REPLACE ? -ENODATA : 0; ++ ++ return ret; ++} ++ ++struct xattr_buf { ++ char *buf; ++ size_t len; ++ size_t used; ++}; ++ ++static int __bch2_xattr_emit(const char *prefix, ++ const char *name, size_t name_len, ++ struct xattr_buf *buf) ++{ ++ const size_t prefix_len = strlen(prefix); ++ const size_t total_len = prefix_len + name_len + 1; ++ ++ if (buf->buf) { ++ if (buf->used + total_len > buf->len) ++ return -ERANGE; ++ ++ memcpy(buf->buf + buf->used, prefix, prefix_len); ++ memcpy(buf->buf + buf->used + prefix_len, ++ name, name_len); ++ buf->buf[buf->used + prefix_len + name_len] = '\0'; ++ } ++ ++ buf->used += total_len; ++ return 0; ++} ++ ++static int bch2_xattr_emit(struct dentry *dentry, ++ const struct bch_xattr *xattr, ++ struct xattr_buf *buf) ++{ ++ const struct xattr_handler *handler = ++ bch2_xattr_type_to_handler(xattr->x_type); ++ ++ return handler && (!handler->list || handler->list(dentry)) ++ ? __bch2_xattr_emit(handler->prefix ?: handler->name, ++ xattr->x_name, xattr->x_name_len, buf) ++ : 0; ++} ++ ++static int bch2_xattr_list_bcachefs(struct bch_fs *c, ++ struct bch_inode_unpacked *inode, ++ struct xattr_buf *buf, ++ bool all) ++{ ++ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; ++ unsigned id; ++ int ret = 0; ++ u64 v; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ v = bch2_inode_opt_get(inode, id); ++ if (!v) ++ continue; ++ ++ if (!all && ++ !(inode->bi_fields_set & (1 << id))) ++ continue; ++ ++ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], ++ strlen(bch2_inode_opts[id]), buf); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ struct bch_fs *c = dentry->d_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; ++ u64 offset = 0, inum = inode->ei_inode.bi_inum; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, ++ SPOS(inum, offset, snapshot), ++ POS(inum, U64_MAX), 0, k, ret) { ++ if (k.k->type != KEY_TYPE_xattr) ++ continue; ++ ++ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); ++ if (ret) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); ++ if (ret) ++ return ret; ++ ++ return buf.used; ++} ++ ++static int bch2_xattr_get_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++} ++ ++static int bch2_xattr_set_handler(const struct xattr_handler *handler, ++ struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_xattr_set(&trans, inode_inum(inode), &hash, ++ name, value, size, ++ handler->flags, flags)); ++} ++ ++static const struct xattr_handler bch_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_USER, ++}; ++ ++static bool bch2_xattr_trusted_list(struct dentry *dentry) ++{ ++ return capable(CAP_SYS_ADMIN); ++} ++ ++static const struct xattr_handler bch_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = bch2_xattr_trusted_list, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, ++}; ++ ++static const struct xattr_handler bch_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, ++}; ++ ++#ifndef NO_BCACHEFS_FS ++ ++static int opt_to_inode_opt(int id) ++{ ++ switch (id) { ++#define 
x(name, ...) \ ++ case Opt_##name: return Inode_opt_##name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ return -1; ++ } ++} ++ ++static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size, ++ bool all) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_opts opts = ++ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); ++ const struct bch_option *opt; ++ int id, inode_opt_id; ++ struct printbuf out = PRINTBUF; ++ int ret; ++ u64 v; ++ ++ id = bch2_opt_lookup(name); ++ if (id < 0 || !bch2_opt_is_inode_opt(id)) ++ return -EINVAL; ++ ++ inode_opt_id = opt_to_inode_opt(id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + id; ++ ++ if (!bch2_opt_defined_by_id(&opts, id)) ++ return -ENODATA; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) ++ return -ENODATA; ++ ++ v = bch2_opt_get_by_id(&opts, id); ++ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); ++ ++ ret = out.pos; ++ ++ if (out.allocation_failure) { ++ ret = -ENOMEM; ++ } else if (buffer) { ++ if (out.pos > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, out.buf, out.pos); ++ } ++ ++ printbuf_exit(&out); ++ return ret; ++} ++ ++static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, false); ++} ++ ++struct inode_opt_set { ++ int id; ++ u64 v; ++ bool defined; ++}; ++ ++static int inode_opt_set_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_opt_set *s = p; ++ ++ if (s->defined) ++ bi->bi_fields_set |= 1U << s->id; ++ else ++ bi->bi_fields_set &= ~(1U << s->id); ++ ++ bch2_inode_opt_set(bi, s->id, s->v); ++ ++ return 0; ++} ++ ++static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ++ struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ const struct bch_option *opt; ++ char *buf; ++ struct inode_opt_set s; ++ int opt_id, inode_opt_id, ret; ++ ++ opt_id = bch2_opt_lookup(name); ++ if (opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + opt_id; ++ ++ inode_opt_id = opt_to_inode_opt(opt_id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ s.id = inode_opt_id; ++ ++ if (value) { ++ u64 v = 0; ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ memcpy(buf, value, size); ++ buf[size] = '\0'; ++ ++ ret = bch2_opt_parse(c, opt, buf, &v, NULL); ++ kfree(buf); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, opt_id, v); ++ if (ret < 0) ++ return ret; ++ ++ s.v = v + 1; ++ s.defined = true; ++ } else { ++ if (!IS_ROOT(dentry)) { ++ struct bch_inode_info *dir = ++ to_bch_ei(d_inode(dentry->d_parent)); ++ ++ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); ++ } else { ++ s.v = 0; ++ } ++ ++ s.defined = false; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (inode_opt_id == Inode_opt_project) { ++ /* ++ * inode fields accessible via the xattr interface are stored ++ * with a +1 bias, so that 0 means unset: ++ */ ++ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (value && ++ (opt_id == Opt_background_compression || ++ opt_id == Opt_background_target)) ++ bch2_rebalance_add_work(c, inode->v.i_blocks); ++ ++ return ret; ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_handler = { ++ .prefix = "bcachefs.", ++ .get = bch2_xattr_bcachefs_get, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++static int bch2_xattr_bcachefs_get_effective( ++ const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, true); ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { ++ .prefix = "bcachefs_effective.", ++ .get = bch2_xattr_bcachefs_get_effective, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++const struct xattr_handler *bch2_xattr_handlers[] = { ++ &bch_xattr_user_handler, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ &posix_acl_access_xattr_handler, ++ &posix_acl_default_xattr_handler, ++#endif ++ &bch_xattr_trusted_handler, ++ &bch_xattr_security_handler, ++#ifndef NO_BCACHEFS_FS ++ &bch_xattr_bcachefs_handler, ++ &bch_xattr_bcachefs_effective_handler, ++#endif ++ NULL ++}; ++ ++static const struct xattr_handler *bch_xattr_handler_map[] = { ++ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = ++ &posix_acl_access_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = ++ &posix_acl_default_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, ++ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, ++}; ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) ++{ ++ return type < ARRAY_SIZE(bch_xattr_handler_map) ++ ? 
bch_xattr_handler_map[type] ++ : NULL; ++} +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +new file mode 100644 +index 000000000000..66d7a1e30350 +--- /dev/null ++++ b/fs/bcachefs/xattr.h +@@ -0,0 +1,50 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_XATTR_H ++#define _BCACHEFS_XATTR_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_xattr_hash_desc; ++ ++int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++ .key_invalid = bch2_xattr_invalid, \ ++ .val_to_text = bch2_xattr_to_text, \ ++} ++ ++static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + ++ name_len + val_len, sizeof(u64)); ++} ++ ++#define xattr_val(_xattr) \ ++ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) ++ ++struct xattr_search_key { ++ u8 type; ++ struct qstr name; ++}; ++ ++#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ ++ { .type = _type, .name = QSTR_INIT(_name, _len) }) ++ ++struct dentry; ++struct xattr_handler; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, ++ const char *, void *, size_t, int); ++ ++int bch2_xattr_set(struct btree_trans *, subvol_inum, ++ const struct bch_hash_info *, ++ const char *, const void *, size_t, int, int); ++ ++ssize_t bch2_xattr_list(struct dentry *, char *, size_t); ++ ++extern const struct xattr_handler *bch2_xattr_handlers[]; ++ ++#endif /* _BCACHEFS_XATTR_H */ +diff --git a/fs/d_path.c b/fs/d_path.c +index e4e0ebad1f15..1bd9e85f2f65 100644 +--- a/fs/d_path.c ++++ b/fs/d_path.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + #include "mount.h" + +@@ -294,6 +295,40 @@ char *d_path(const struct path *path, char *buf, int buflen) + } + EXPORT_SYMBOL(d_path); + ++/** ++ * prt_path - format a path for output ++ * @out: printbuf to output to ++ * @path: path to write into the sequence buffer. ++ * @esc: set of characters to escape in the output ++ * ++ * Write a path name into the sequence buffer. 
++ * ++ * Returns 0 on success, or error code from d_path ++ */ ++int prt_path(struct printbuf *out, const struct path *path, const char *esc) ++{ ++ char *p, *buf; ++ size_t size; ++again: ++ buf = out->buf + out->pos; ++ size = printbuf_remaining_size(out); ++ ++ p = d_path(path, buf, size); ++ if (IS_ERR(p)) { ++ printbuf_make_room(out, max_t(size_t, 64, size * 2)); ++ if (printbuf_remaining_size(out) > size) ++ goto again; ++ ++ return PTR_ERR(p); ++ } ++ ++ p = mangle_path(buf, p, esc); ++ if (p) ++ out->pos += p - buf; ++ return 0; ++} ++EXPORT_SYMBOL(prt_path); ++ + /* + * Helper function for dentry_operations.d_dname() members + */ +diff --git a/fs/dcache.c b/fs/dcache.c +index 93f4f5ee07bf..d90ed65e2a75 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -3193,9 +3193,8 @@ void d_genocide(struct dentry *parent) + + EXPORT_SYMBOL(d_genocide); + +-void d_tmpfile(struct dentry *dentry, struct inode *inode) ++void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) + { +- inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); +@@ -3205,6 +3204,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); ++} ++EXPORT_SYMBOL(d_mark_tmpfile); ++ ++void d_tmpfile(struct dentry *dentry, struct inode *inode) ++{ ++ inode_dec_link_count(inode); ++ d_mark_tmpfile(dentry, inode); + d_instantiate(dentry, inode); + } + EXPORT_SYMBOL(d_tmpfile); +diff --git a/fs/inode.c b/fs/inode.c +index bd4da9c5207e..ac0da28a1ac6 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -56,8 +56,23 @@ + + static unsigned int i_hash_mask __read_mostly; + static unsigned int i_hash_shift __read_mostly; +-static struct hlist_head *inode_hashtable __read_mostly; +-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); ++static struct hlist_bl_head *inode_hashtable __read_mostly; ++ ++static unsigned long hash(struct super_block *sb, unsigned long hashval) ++{ ++ unsigned long tmp; ++ ++ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / ++ L1_CACHE_BYTES; ++ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); ++ return tmp & i_hash_mask; ++} ++ ++static inline struct hlist_bl_head *i_hash_head(struct super_block *sb, ++ unsigned int hashval) ++{ ++ return inode_hashtable + hash(sb, hashval); ++} + + /* + * Empty aops. Can be used for the cases where the user does not +@@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); + void inode_init_once(struct inode *inode) + { + memset(inode, 0, sizeof(*inode)); +- INIT_HLIST_NODE(&inode->i_hash); ++ INIT_HLIST_BL_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); +@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) + } + } + +-static unsigned long hash(struct super_block *sb, unsigned long hashval) ++/* ++ * Ensure that we store the hash head in the inode when we insert the inode into ++ * the hlist_bl_head... 
++ */ ++static inline void ++__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b) + { +- unsigned long tmp; +- +- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / +- L1_CACHE_BYTES; +- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); +- return tmp & i_hash_mask; ++ hlist_bl_add_head_rcu(&inode->i_hash, b); ++ inode->i_hash_head = b; + } + + /** +@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) + */ + void __insert_inode_hash(struct inode *inode, unsigned long hashval) + { +- struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); + +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + spin_lock(&inode->i_lock); +- hlist_add_head_rcu(&inode->i_hash, b); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + } + EXPORT_SYMBOL(__insert_inode_hash); + +@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); + */ + void __remove_inode_hash(struct inode *inode) + { +- spin_lock(&inode_hash_lock); +- spin_lock(&inode->i_lock); +- hlist_del_init_rcu(&inode->i_hash); +- spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ struct hlist_bl_head *b = inode->i_hash_head; ++ ++ /* ++ * There are some callers that come through here without synchronisation ++ * and potentially with multiple references to the inode. Hence we have ++ * to handle the case that we might race with a remove and insert to a ++ * different list. Coda, in particular, seems to have a userspace API ++ * that can directly trigger "unhash/rehash to different list" behaviour ++ * without any serialisation at all. ++ * ++ * Hence we have to handle the situation where the inode->i_hash_head ++ * might point to a different list than what we expect, indicating that ++ * we raced with another unhash and potentially a new insertion. This ++ * means we have to retest the head once we have everything locked up ++ * and loop again if it doesn't match. ++ */ ++ while (b) { ++ hlist_bl_lock(b); ++ spin_lock(&inode->i_lock); ++ if (b != inode->i_hash_head) { ++ hlist_bl_unlock(b); ++ b = inode->i_hash_head; ++ spin_unlock(&inode->i_lock); ++ continue; ++ } ++ /* ++ * Need to set the pprev pointer to NULL after list removal so ++ * that both RCU traversals and hlist_bl_unhashed() work ++ * correctly at this point. ++ */ ++ hlist_bl_del_rcu(&inode->i_hash); ++ inode->i_hash.pprev = NULL; ++ inode->i_hash_head = NULL; ++ spin_unlock(&inode->i_lock); ++ hlist_bl_unlock(b); ++ break; ++ } ++ + } + EXPORT_SYMBOL(__remove_inode_hash); + +@@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) + return freed; + } + +-static void __wait_on_freeing_inode(struct inode *inode); ++static void __wait_on_freeing_inode(struct hlist_bl_head *b, ++ struct inode *inode); + /* + * Called with the inode lock held. 
+ */ + static struct inode *find_inode(struct super_block *sb, +- struct hlist_head *head, ++ struct hlist_bl_head *b, + int (*test)(struct inode *, void *), + void *data) + { ++ struct hlist_bl_node *node; + struct inode *inode = NULL; + + repeat: +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { +- __wait_on_freeing_inode(inode); ++ __wait_on_freeing_inode(b, inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { +@@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, + * iget_locked for details. + */ + static struct inode *find_inode_fast(struct super_block *sb, +- struct hlist_head *head, unsigned long ino) ++ struct hlist_bl_head *b, unsigned long ino) + { ++ struct hlist_bl_node *node; + struct inode *inode = NULL; + + repeat: +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { +- __wait_on_freeing_inode(inode); ++ __wait_on_freeing_inode(b, inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { +@@ -1156,26 +1208,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); + * return it locked, hashed, and with the I_NEW flag set. The file system gets + * to fill it in before unlocking it via unlock_new_inode(). + * +- * Note both @test and @set are called with the inode_hash_lock held, so can't +- * sleep. ++ * Note both @test and @set are called with the inode hash chain lock held, ++ * so can't sleep. + */ + struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); + struct inode *old; + bool creating = inode->i_state & I_CREATING; + + again: +- spin_lock(&inode_hash_lock); +- old = find_inode(inode->i_sb, head, test, data); ++ hlist_bl_lock(b); ++ old = find_inode(inode->i_sb, b, test, data); + if (unlikely(old)) { + /* + * Uhhuh, somebody else created the same inode under us. + * Use the old inode instead of the preallocated one. 
+ */ +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + if (IS_ERR(old)) + return NULL; + wait_on_inode(old); +@@ -1197,12 +1249,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + */ + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; +- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); + if (!creating) + inode_sb_list_add(inode); + unlock: +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + + return inode; + } +@@ -1263,12 +1315,12 @@ EXPORT_SYMBOL(iget5_locked); + */ + struct inode *iget_locked(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + struct inode *inode; + again: +- spin_lock(&inode_hash_lock); +- inode = find_inode_fast(sb, head, ino); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode_fast(sb, b, ino); ++ hlist_bl_unlock(b); + if (inode) { + if (IS_ERR(inode)) + return NULL; +@@ -1284,17 +1336,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) + if (inode) { + struct inode *old; + +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + /* We released the lock, so.. */ +- old = find_inode_fast(sb, head, ino); ++ old = find_inode_fast(sb, b, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; +- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents +@@ -1307,7 +1359,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) + * us. Use the old inode instead of the one we just + * allocated. + */ +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + destroy_inode(inode); + if (IS_ERR(old)) + return NULL; +@@ -1331,10 +1383,11 @@ EXPORT_SYMBOL(iget_locked); + */ + static int test_inode_iunique(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *b = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_node *node; + struct inode *inode; + +- hlist_for_each_entry_rcu(inode, b, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) + return 0; + } +@@ -1418,12 +1471,12 @@ EXPORT_SYMBOL(igrab); + struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct inode *inode; + +- spin_lock(&inode_hash_lock); +- inode = find_inode(sb, head, test, data); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode(sb, b, test, data); ++ hlist_bl_unlock(b); + + return IS_ERR(inode) ? 
NULL : inode; + } +@@ -1473,12 +1526,12 @@ EXPORT_SYMBOL(ilookup5); + */ + struct inode *ilookup(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + struct inode *inode; + again: +- spin_lock(&inode_hash_lock); +- inode = find_inode_fast(sb, head, ino); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode_fast(sb, b, ino); ++ hlist_bl_unlock(b); + + if (inode) { + if (IS_ERR(inode)) +@@ -1522,12 +1575,13 @@ struct inode *find_inode_nowait(struct super_block *sb, + void *), + void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); ++ struct hlist_bl_node *node; + struct inode *inode, *ret_inode = NULL; + int mval; + +- spin_lock(&inode_hash_lock); +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_lock(b); ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); +@@ -1538,7 +1592,7 @@ struct inode *find_inode_nowait(struct super_block *sb, + goto out; + } + out: +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return ret_inode; + } + EXPORT_SYMBOL(find_inode_nowait); +@@ -1567,13 +1621,14 @@ EXPORT_SYMBOL(find_inode_nowait); + struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); ++ struct hlist_bl_node *node; + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_rcu() usage"); + +- hlist_for_each_entry_rcu(inode, head, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + test(inode, data)) +@@ -1605,13 +1660,14 @@ EXPORT_SYMBOL(find_inode_rcu); + struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_node *node; + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_by_ino_rcu() usage"); + +- hlist_for_each_entry_rcu(inode, head, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_ino == ino && + inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) +@@ -1625,39 +1681,42 @@ int insert_inode_locked(struct inode *inode) + { + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + + while (1) { +- struct inode *old = NULL; +- spin_lock(&inode_hash_lock); +- hlist_for_each_entry(old, head, i_hash) { +- if (old->i_ino != ino) ++ struct hlist_bl_node *node; ++ struct inode *old = NULL, *t; ++ ++ hlist_bl_lock(b); ++ hlist_bl_for_each_entry(t, node, b, i_hash) { ++ if (t->i_ino != ino) + continue; +- if (old->i_sb != sb) ++ if (t->i_sb != sb) + continue; +- spin_lock(&old->i_lock); +- if (old->i_state & (I_FREEING|I_WILL_FREE)) { +- spin_unlock(&old->i_lock); ++ spin_lock(&t->i_lock); ++ if (t->i_state & (I_FREEING|I_WILL_FREE)) { ++ spin_unlock(&t->i_lock); + continue; + } ++ old = t; + break; + } + if (likely(!old)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW | I_CREATING; 
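/*
 * Aside, not part of the patch: the inode-cache changes above drop the single
 * global inode_hash_lock in favour of one lock per hash chain (the hlist_bl
 * bit lock), and the insert paths become "lock the chain, search it, insert
 * only if nothing live was found, otherwise reuse the existing inode". The
 * user-space sketch below shows the same shape with invented names, a plain
 * pthread mutex standing in for hlist_bl_lock(), and none of the kernel's
 * I_FREEING/I_WILL_FREE waiting or RCU lookup paths.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_BUCKETS 64

struct demo_node {
	unsigned long key;
	struct demo_node *next;
};

static struct demo_bucket {
	pthread_mutex_t lock;		/* per-chain lock, like hlist_bl_lock() */
	struct demo_node *head;
} demo_table[DEMO_BUCKETS];

static unsigned demo_hash(unsigned long key)
{
	return (unsigned) ((key * 0x9E3779B97F4A7C15ULL) >> 58);	/* 6 bits */
}

/* Return the entry for @key, inserting a preallocated one only on a miss. */
static struct demo_node *demo_insert_or_get(unsigned long key)
{
	struct demo_bucket *b = &demo_table[demo_hash(key)];
	struct demo_node *n, *new_node = malloc(sizeof(*new_node));

	if (!new_node)
		return NULL;
	new_node->key = key;

	pthread_mutex_lock(&b->lock);
	for (n = b->head; n; n = n->next)
		if (n->key == key)
			break;
	if (!n) {
		/* miss: publish the preallocated node under the chain lock */
		new_node->next = b->head;
		b->head = new_node;
		n = new_node;
		new_node = NULL;
	}
	pthread_mutex_unlock(&b->lock);

	free(new_node);		/* non-NULL only if an existing entry won */
	return n;
}

int main(void)
{
	unsigned i;

	for (i = 0; i < DEMO_BUCKETS; i++)
		pthread_mutex_init(&demo_table[i].lock, NULL);

	if (demo_insert_or_get(42) == demo_insert_or_get(42))
		printf("second call reused the entry inserted by the first\n");
	return 0;
}
/* Build with: cc -o hash_demo hash_demo.c -lpthread */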
+- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return 0; + } + if (unlikely(old->i_state & I_CREATING)) { + spin_unlock(&old->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return -EBUSY; + } + __iget(old); + spin_unlock(&old->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); +@@ -2131,17 +2190,18 @@ EXPORT_SYMBOL(inode_needs_sync); + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. + */ +-static void __wait_on_freeing_inode(struct inode *inode) ++static void __wait_on_freeing_inode(struct hlist_bl_head *b, ++ struct inode *inode) + { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + schedule(); + finish_wait(wq, &wait.wq_entry); +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + } + + static __initdata unsigned long ihash_entries; +@@ -2167,7 +2227,7 @@ void __init inode_init_early(void) + + inode_hashtable = + alloc_large_system_hash("Inode-cache", +- sizeof(struct hlist_head), ++ sizeof(struct hlist_bl_head), + ihash_entries, + 14, + HASH_EARLY | HASH_ZERO, +@@ -2193,7 +2253,7 @@ void __init inode_init(void) + + inode_hashtable = + alloc_large_system_hash("Inode-cache", +- sizeof(struct hlist_head), ++ sizeof(struct hlist_bl_head), + ihash_entries, + 14, + HASH_ZERO, +diff --git a/include/linux/bio.h b/include/linux/bio.h +index 992ee987f273..6d5acc1b407f 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -480,7 +480,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + extern void bio_copy_data(struct bio *dst, struct bio *src); + extern void bio_free_pages(struct bio *bio); + void guard_bio_eod(struct bio *bio); +-void zero_fill_bio(struct bio *bio); ++void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); ++ ++static inline void zero_fill_bio(struct bio *bio) ++{ ++ zero_fill_bio_iter(bio, bio->bi_iter); ++} + + static inline void bio_release_pages(struct bio *bio, bool mark_dirty) + { +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 2f7b43444c5f..4ef515977abc 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -884,6 +884,7 @@ extern const char *blk_op_str(unsigned int op); + + int blk_status_to_errno(blk_status_t status); + blk_status_t errno_to_blk_status(int errno); ++const char *blk_status_to_str(blk_status_t status); + + /* only poll the hardware once, don't continue until a completion was found */ + #define BLK_POLL_ONESHOT (1 << 0) +diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h +similarity index 94% +rename from drivers/md/bcache/closure.h +rename to include/linux/closure.h +index c88cdc4ae4ec..36b4a83f9b77 100644 +--- a/drivers/md/bcache/closure.h ++++ b/include/linux/closure.h +@@ -155,7 +155,7 @@ struct closure { + + atomic_t remaining; + +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + #define CLOSURE_MAGIC_DEAD 0xc054dead + #define CLOSURE_MAGIC_ALIVE 0xc054a11e + +@@ -184,15 +184,13 @@ static inline void closure_sync(struct closure *cl) + __closure_sync(cl); + } + +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + 
+-void closure_debug_init(void); + void closure_debug_create(struct closure *cl); + void closure_debug_destroy(struct closure *cl); + + #else + +-static inline void closure_debug_init(void) {} + static inline void closure_debug_create(struct closure *cl) {} + static inline void closure_debug_destroy(struct closure *cl) {} + +@@ -200,21 +198,21 @@ static inline void closure_debug_destroy(struct closure *cl) {} + + static inline void closure_set_ip(struct closure *cl) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; + #endif + } + + static inline void closure_set_ret_ip(struct closure *cl) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; + #endif + } + + static inline void closure_set_waiting(struct closure *cl, unsigned long f) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; + #endif + } +@@ -243,6 +241,7 @@ static inline void closure_queue(struct closure *cl) + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); ++ + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); +@@ -255,7 +254,7 @@ static inline void closure_queue(struct closure *cl) + */ + static inline void closure_get(struct closure *cl) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); + #else +@@ -271,7 +270,7 @@ static inline void closure_get(struct closure *cl) + */ + static inline void closure_init(struct closure *cl, struct closure *parent) + { +- memset(cl, 0, sizeof(struct closure)); ++ cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); +@@ -375,4 +374,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, + continue_at_nobarrier(cl, fn, wq); + } + ++#define __closure_wait_event(waitlist, _cond) \ ++do { \ ++ struct closure cl; \ ++ \ ++ closure_init_stack(&cl); \ ++ \ ++ while (1) { \ ++ closure_wait(waitlist, &cl); \ ++ if (_cond) \ ++ break; \ ++ closure_sync(&cl); \ ++ } \ ++ closure_wake_up(waitlist); \ ++ closure_sync(&cl); \ ++} while (0) ++ ++#define closure_wait_event(waitlist, _cond) \ ++do { \ ++ if (!(_cond)) \ ++ __closure_wait_event(waitlist, _cond); \ ++} while (0) ++ + #endif /* _LINUX_CLOSURE_H */ +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index 445e80517cab..57e7d0b94119 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -371,4 +371,9 @@ + */ + #define __weak __attribute__((__weak__)) + ++/* ++ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ */ ++#define __flatten __attribute__((flatten)) ++ + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index f5bba51480b2..6c661059a55b 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -248,6 +248,7 @@ extern struct dentry * d_make_root(struct inode *); + /* - the ramfs-type tree */ + extern void d_genocide(struct dentry *); + ++extern void d_mark_tmpfile(struct dentry *, struct inode *); + extern void d_tmpfile(struct dentry *, struct inode *); + + extern struct dentry *d_find_alias(struct inode *); +@@ -293,6 +294,7 @@ extern char *d_absolute_path(const struct path *, char *, int); + extern char *d_path(const struct path *, char *, int); + extern char *dentry_path_raw(const struct dentry *, 
char *, int); + extern char *dentry_path(const struct dentry *, char *, int); ++extern int prt_path(struct printbuf *, const struct path *, const char *); + + /* Allocation counts.. */ + +diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h +index fe848901fcc3..5a3cc0e1da9b 100644 +--- a/include/linux/exportfs.h ++++ b/include/linux/exportfs.h +@@ -98,6 +98,12 @@ enum fid_type { + */ + FILEID_FAT_WITH_PARENT = 0x72, + ++ /* ++ * 64 bit inode number, 32 bit subvolume, 32 bit generation number: ++ */ ++ FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, ++ FILEID_BCACHEFS_WITH_PARENT = 0x81, ++ + /* + * 128 bit child FID (struct lu_fid) + * 128 bit parent FID (struct lu_fid) +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 9ad5e3520fae..1f7671a674e3 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -630,7 +630,8 @@ struct inode { + unsigned long dirtied_when; /* jiffies of first dirtying */ + unsigned long dirtied_time_when; + +- struct hlist_node i_hash; ++ struct hlist_bl_node i_hash; ++ struct hlist_bl_head *i_hash_head; + struct list_head i_io_list; /* backing dev IO list */ + #ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +@@ -696,7 +697,7 @@ static inline unsigned int i_blocksize(const struct inode *node) + + static inline int inode_unhashed(struct inode *inode) + { +- return hlist_unhashed(&inode->i_hash); ++ return hlist_bl_unhashed(&inode->i_hash); + } + + /* +@@ -707,7 +708,7 @@ static inline int inode_unhashed(struct inode *inode) + */ + static inline void inode_fake_hash(struct inode *inode) + { +- hlist_add_fake(&inode->i_hash); ++ hlist_bl_add_fake(&inode->i_hash); + } + + /* +@@ -2974,7 +2975,7 @@ static inline void insert_inode_hash(struct inode *inode) + extern void __remove_inode_hash(struct inode *); + static inline void remove_inode_hash(struct inode *inode) + { +- if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) ++ if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash)) + __remove_inode_hash(inode); + } + +diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h +index 107613f7d792..c74b7376990d 100644 +--- a/include/linux/generic-radix-tree.h ++++ b/include/linux/generic-radix-tree.h +@@ -38,6 +38,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -116,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) + + #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) + #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) ++#define __genradix_objs_per_page(_radix) \ ++ (PAGE_SIZE / sizeof((_radix)->type[0])) ++#define __genradix_page_remainder(_radix) \ ++ (PAGE_SIZE % sizeof((_radix)->type[0])) ++ + #define __genradix_idx_to_offset(_radix, _idx) \ + __idx_to_offset(_idx, __genradix_obj_size(_radix)) + +@@ -179,11 +185,35 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); + #define genradix_iter_peek(_iter, _radix) \ + (__genradix_cast(_radix) \ + __genradix_iter_peek(_iter, &(_radix)->tree, \ +- PAGE_SIZE / __genradix_obj_size(_radix))) ++ __genradix_objs_per_page(_radix))) ++ ++void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, ++ size_t, size_t); ++ ++/** ++ * genradix_iter_peek - get first entry at or below iterator's current ++ * position ++ * @_iter: a genradix_iter ++ * @_radix: genradix being iterated over ++ * ++ * If no more entries exist at or below @_iter's current position, returns NULL ++ */ ++#define 
genradix_iter_peek_prev(_iter, _radix) \ ++ (__genradix_cast(_radix) \ ++ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \ ++ __genradix_objs_per_page(_radix), \ ++ __genradix_obj_size(_radix) + \ ++ __genradix_page_remainder(_radix))) + + static inline void __genradix_iter_advance(struct genradix_iter *iter, + size_t obj_size) + { ++ if (iter->offset + obj_size < iter->offset) { ++ iter->offset = SIZE_MAX; ++ iter->pos = SIZE_MAX; ++ return; ++ } ++ + iter->offset += obj_size; + + if (!is_power_of_2(obj_size) && +@@ -196,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, + #define genradix_iter_advance(_iter, _radix) \ + __genradix_iter_advance(_iter, __genradix_obj_size(_radix)) + ++static inline void __genradix_iter_rewind(struct genradix_iter *iter, ++ size_t obj_size) ++{ ++ if (iter->offset == 0 || ++ iter->offset == SIZE_MAX) { ++ iter->offset = SIZE_MAX; ++ return; ++ } ++ ++ if ((iter->offset & (PAGE_SIZE - 1)) == 0) ++ iter->offset -= PAGE_SIZE % obj_size; ++ ++ iter->offset -= obj_size; ++ iter->pos--; ++} ++ ++#define genradix_iter_rewind(_iter, _radix) \ ++ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix)) ++ + #define genradix_for_each_from(_radix, _iter, _p, _start) \ + for (_iter = genradix_iter_init(_radix, _start); \ + (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \ +@@ -213,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, + #define genradix_for_each(_radix, _iter, _p) \ + genradix_for_each_from(_radix, _iter, _p, 0) + ++#define genradix_last_pos(_radix) \ ++ (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) ++ ++/** ++ * genradix_for_each_reverse - iterate over entry in a genradix, reverse order ++ * @_radix: genradix to iterate over ++ * @_iter: a genradix_iter to track current position ++ * @_p: pointer to genradix entry type ++ * ++ * On every iteration, @_p will point to the current entry, and @_iter.pos ++ * will be the current entry's index. 
++ */ ++#define genradix_for_each_reverse(_radix, _iter, _p) \ ++ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\ ++ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\ ++ genradix_iter_rewind(&_iter, _radix)) ++ + int __genradix_prealloc(struct __genradix *, size_t, gfp_t); + + /** +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index fe6efb24d151..9ba5a53c6ad5 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -202,11 +202,17 @@ static inline void might_fault(void) { } + + void do_exit(long error_code) __noreturn; + ++struct printbuf; ++extern void prt_u64_minwidth(struct printbuf *out, u64 num, unsigned width); ++extern void prt_u64(struct printbuf *out, u64 num); + extern int num_to_str(char *buf, int size, + unsigned long long num, unsigned int width); + + /* lib/printf utilities */ + ++extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...); ++extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list); ++ + extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...); + extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list); + extern __printf(3, 4) +@@ -289,6 +295,12 @@ extern int hex_to_bin(unsigned char ch); + extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); + extern char *bin2hex(char *dst, const void *src, size_t count); + ++struct printbuf; ++void prt_hex_bytes(struct printbuf *, const void *, unsigned, unsigned, unsigned); ++void prt_hex_line(struct printbuf *, const void *, size_t, int, int, bool); ++void prt_hex_dump(struct printbuf *, const void *, size_t, ++ const char *, int, unsigned, unsigned, bool); ++ + bool mac_pton(const char *s, u8 *mac); + + /* +diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h +index ae1b541446c9..8ee2bf5af131 100644 +--- a/include/linux/list_bl.h ++++ b/include/linux/list_bl.h +@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) + } + } + ++/** ++ * hlist_bl_add_fake - create a fake list consisting of a single headless node ++ * @n: Node to make a fake list out of ++ * ++ * This makes @n appear to be its own predecessor on a headless hlist. ++ * The point of this is to allow things like hlist_bl_del() to work correctly ++ * in cases where there is no list. ++ */ ++static inline void hlist_bl_add_fake(struct hlist_bl_node *n) ++{ ++ n->pprev = &n->next; ++} ++ ++/** ++ * hlist_fake: Is this node a fake hlist_bl? ++ * @h: Node to check for being a self-referential fake hlist. 
++ */ ++static inline bool hlist_bl_fake(struct hlist_bl_node *n) ++{ ++ return n->pprev == &n->next; ++} ++ + static inline void hlist_bl_lock(struct hlist_bl_head *b) + { + bit_spin_lock(0, (unsigned long *)b); +diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h +index b6829b970093..5b90b2abd326 100644 +--- a/include/linux/lockdep.h ++++ b/include/linux/lockdep.h +@@ -335,6 +335,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); + #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) + #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) + ++int lock_class_is_held(struct lock_class_key *key); ++ + #else /* !CONFIG_LOCKDEP */ + + static inline void lockdep_init_task(struct task_struct *task) +@@ -423,6 +425,8 @@ extern int lockdep_is_held(const void *); + #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) + #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) + ++static inline int lock_class_is_held(struct lock_class_key *key) { return 0; } ++ + #endif /* !LOCKDEP */ + + enum xhlock_context_t { +diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h +new file mode 100644 +index 000000000000..f39d8edfba02 +--- /dev/null ++++ b/include/linux/pretty-printers.h +@@ -0,0 +1,10 @@ ++/* SPDX-License-Identifier: LGPL-2.1+ */ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#ifndef _LINUX_PRETTY_PRINTERS_H ++#define _LINUX_PRETTY_PRINTERS_H ++ ++void prt_string_option(struct printbuf *, const char * const[], size_t); ++void prt_bitflags(struct printbuf *, const char * const[], u64); ++ ++#endif /* _LINUX_PRETTY_PRINTERS_H */ +diff --git a/include/linux/printbuf.h b/include/linux/printbuf.h +new file mode 100644 +index 000000000000..861c5d75f852 +--- /dev/null ++++ b/include/linux/printbuf.h +@@ -0,0 +1,283 @@ ++/* SPDX-License-Identifier: LGPL-2.1+ */ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#ifndef _LINUX_PRINTBUF_H ++#define _LINUX_PRINTBUF_H ++ ++/* ++ * Printbufs: Simple strings for printing to, with optional heap allocation ++ * ++ * This code has provisions for use in userspace, to aid in making other code ++ * portable between kernelspace and userspace. ++ * ++ * Basic example: ++ * struct printbuf buf = PRINTBUF; ++ * ++ * prt_printf(&buf, "foo="); ++ * foo_to_text(&buf, foo); ++ * printk("%s", buf.buf); ++ * printbuf_exit(&buf); ++ * ++ * Or ++ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) ++ * ++ * We can now write pretty printers instead of writing code that dumps ++ * everything to the kernel log buffer, and then those pretty-printers can be ++ * used by other code that outputs to kernel log, sysfs, debugfs, etc. ++ * ++ * Memory allocation: Outputing to a printbuf may allocate memory. This ++ * allocation is done with GFP_KERNEL, by default: use the newer ++ * memalloc_*_(save|restore) functions as needed. ++ * ++ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations ++ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. ++ * ++ * It's allowed to grab the output buffer and free it later with kfree() instead ++ * of using printbuf_exit(), if the user just needs a heap allocated string at ++ * the end. ++ * ++ * Memory allocation failures: We don't return errors directly, because on ++ * memory allocation failure we usually don't want to bail out and unwind - we ++ * want to print what we've got, on a best-effort basis. 
But code that does want ++ * to return -ENOMEM may check printbuf.allocation_failure. ++ * ++ * Indenting, tabstops: ++ * ++ * To aid in writing multi-line pretty printers spread across multiple ++ * functions, printbufs track the current indent level. ++ * ++ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent ++ * level, respectively. ++ * ++ * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from ++ * start of line. Once set, prt_tab() will output spaces up to the next tabstop. ++ * prt_tab_rjust() will also advance the current line of text up to the next ++ * tabstop, but it does so by shifting text since the previous tabstop up to the ++ * next tabstop - right justifying it. ++ * ++ * Make sure you use prt_newline() instead of \n in the format string for indent ++ * level and tabstops to work correctly. ++ * ++ * Output units: printbuf->units exists to tell pretty-printers how to output ++ * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as ++ * human readable bytes. prt_units() obeys it. ++ */ ++ ++#include ++#include ++ ++enum printbuf_si { ++ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ ++ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ ++}; ++ ++struct printbuf { ++ char *buf; ++ unsigned size; ++ unsigned pos; ++ unsigned last_newline; ++ unsigned last_field; ++ unsigned indent; ++ /* ++ * If nonzero, allocations will be done with GFP_ATOMIC: ++ */ ++ u8 atomic; ++ bool allocation_failure:1; ++ bool heap_allocated:1; ++ enum printbuf_si si_units:1; ++ bool human_readable_units:1; ++ u8 tabstop; ++ u8 tabstops[4]; ++}; ++ ++int printbuf_make_room(struct printbuf *, unsigned); ++const char *printbuf_str(const struct printbuf *); ++void printbuf_exit(struct printbuf *); ++ ++void prt_newline(struct printbuf *); ++void printbuf_indent_add(struct printbuf *, unsigned); ++void printbuf_indent_sub(struct printbuf *, unsigned); ++void prt_tab(struct printbuf *); ++void prt_tab_rjust(struct printbuf *); ++void prt_human_readable_u64(struct printbuf *, u64); ++void prt_human_readable_s64(struct printbuf *, s64); ++void prt_units_u64(struct printbuf *, u64); ++void prt_units_s64(struct printbuf *, s64); ++ ++/* Initializer for a heap allocated printbuf: */ ++#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) ++ ++/* Initializer for a printbuf that points to an external buffer: */ ++#define PRINTBUF_EXTERN(_buf, _size) \ ++((struct printbuf) { \ ++ .buf = _buf, \ ++ .size = _size, \ ++}) ++ ++/* ++ * Returns size remaining of output buffer: ++ */ ++static inline unsigned printbuf_remaining_size(struct printbuf *out) ++{ ++ return out->pos < out->size ? out->size - out->pos : 0; ++} ++ ++/* ++ * Returns number of characters we can print to the output buffer - i.e. ++ * excluding the terminating nul: ++ */ ++static inline unsigned printbuf_remaining(struct printbuf *out) ++{ ++ return out->pos < out->size ? out->size - out->pos - 1 : 0; ++} ++ ++static inline unsigned printbuf_written(struct printbuf *out) ++{ ++ return out->size ? 
min(out->pos, out->size - 1) : 0; ++} ++ ++/* ++ * Returns true if output was truncated: ++ */ ++static inline bool printbuf_overflowed(struct printbuf *out) ++{ ++ return out->pos >= out->size; ++} ++ ++static inline void printbuf_nul_terminate(struct printbuf *out) ++{ ++ printbuf_make_room(out, 1); ++ ++ if (out->pos < out->size) ++ out->buf[out->pos] = 0; ++ else if (out->size) ++ out->buf[out->size - 1] = 0; ++} ++ ++/* Doesn't call printbuf_make_room(), doesn't nul terminate: */ ++static inline void __prt_char_reserved(struct printbuf *out, char c) ++{ ++ if (printbuf_remaining(out)) ++ out->buf[out->pos] = c; ++ out->pos++; ++} ++ ++/* Doesn't nul terminate: */ ++static inline void __prt_char(struct printbuf *out, char c) ++{ ++ printbuf_make_room(out, 1); ++ __prt_char_reserved(out, c); ++} ++ ++static inline void prt_char(struct printbuf *out, char c) ++{ ++ __prt_char(out, c); ++ printbuf_nul_terminate(out); ++} ++ ++static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) ++{ ++ unsigned i, can_print = min(n, printbuf_remaining(out)); ++ ++ for (i = 0; i < can_print; i++) ++ out->buf[out->pos++] = c; ++ out->pos += n - can_print; ++} ++ ++static inline void prt_chars(struct printbuf *out, char c, unsigned n) ++{ ++ printbuf_make_room(out, n); ++ __prt_chars_reserved(out, c, n); ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) ++{ ++ unsigned i, can_print; ++ ++ printbuf_make_room(out, n); ++ ++ can_print = min(n, printbuf_remaining(out)); ++ ++ for (i = 0; i < can_print; i++) ++ out->buf[out->pos++] = ((char *) b)[i]; ++ out->pos += n - can_print; ++ ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_str(struct printbuf *out, const char *str) ++{ ++ prt_bytes(out, str, strlen(str)); ++} ++ ++static inline void prt_hex_byte(struct printbuf *out, u8 byte) ++{ ++ printbuf_make_room(out, 2); ++ __prt_char_reserved(out, hex_asc_hi(byte)); ++ __prt_char_reserved(out, hex_asc_lo(byte)); ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) ++{ ++ printbuf_make_room(out, 2); ++ __prt_char_reserved(out, hex_asc_upper_hi(byte)); ++ __prt_char_reserved(out, hex_asc_upper_lo(byte)); ++ printbuf_nul_terminate(out); ++} ++ ++/** ++ * printbuf_reset - re-use a printbuf without freeing and re-initializing it: ++ */ ++static inline void printbuf_reset(struct printbuf *buf) ++{ ++ buf->pos = 0; ++ buf->allocation_failure = 0; ++ buf->indent = 0; ++ buf->tabstop = 0; ++} ++ ++/** ++ * printbuf_atomic_inc - mark as entering an atomic section ++ */ ++static inline void printbuf_atomic_inc(struct printbuf *buf) ++{ ++ buf->atomic++; ++} ++ ++/** ++ * printbuf_atomic_dec - mark as leaving an atomic section ++ */ ++static inline void printbuf_atomic_dec(struct printbuf *buf) ++{ ++ buf->atomic--; ++} ++ ++/* ++ * This is used for the %pf(%p) sprintf format extension, where we pass a pretty ++ * printer and arguments to the pretty-printer to sprintf ++ * ++ * Instead of passing a pretty-printer function to sprintf directly, we pass it ++ * a pointer to a struct call_pp, so that sprintf can check that the magic ++ * number is present, which in turn ensures that the CALL_PP() macro has been ++ * used in order to typecheck the arguments to the pretty printer function ++ * ++ * Example usage: ++ * sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev)); ++ */ ++struct call_pp { ++ unsigned long magic; ++ void *fn; ++}; ++ ++#define PP_TYPECHECK(fn, ...) 
\ ++ ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); }) ++ ++#define CALL_PP_MAGIC (unsigned long) 0xce0b92d22f6b6be4 ++ ++#define CALL_PP(fn, ...) \ ++ (PP_TYPECHECK(fn, ##__VA_ARGS__), \ ++ &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__ ++ ++#endif /* _LINUX_PRINTBUF_H */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c46f3a63b758..5038c87db740 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -857,6 +857,7 @@ struct task_struct { + + struct mm_struct *mm; + struct mm_struct *active_mm; ++ struct address_space *faults_disabled_mapping; + + /* Per-thread vma caching: */ + struct vmacache vmacache; +diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h +deleted file mode 100644 +index 5b31c5147969..000000000000 +--- a/include/linux/seq_buf.h ++++ /dev/null +@@ -1,162 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _LINUX_SEQ_BUF_H +-#define _LINUX_SEQ_BUF_H +- +-#include +- +-/* +- * Trace sequences are used to allow a function to call several other functions +- * to create a string of data to use. +- */ +- +-/** +- * seq_buf - seq buffer structure +- * @buffer: pointer to the buffer +- * @size: size of the buffer +- * @len: the amount of data inside the buffer +- * @readpos: The next position to read in the buffer. +- */ +-struct seq_buf { +- char *buffer; +- size_t size; +- size_t len; +- loff_t readpos; +-}; +- +-static inline void seq_buf_clear(struct seq_buf *s) +-{ +- s->len = 0; +- s->readpos = 0; +-} +- +-static inline void +-seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) +-{ +- s->buffer = buf; +- s->size = size; +- seq_buf_clear(s); +-} +- +-/* +- * seq_buf have a buffer that might overflow. When this happens +- * the len and size are set to be equal. +- */ +-static inline bool +-seq_buf_has_overflowed(struct seq_buf *s) +-{ +- return s->len > s->size; +-} +- +-static inline void +-seq_buf_set_overflow(struct seq_buf *s) +-{ +- s->len = s->size + 1; +-} +- +-/* +- * How much buffer is left on the seq_buf? +- */ +-static inline unsigned int +-seq_buf_buffer_left(struct seq_buf *s) +-{ +- if (seq_buf_has_overflowed(s)) +- return 0; +- +- return s->size - s->len; +-} +- +-/* How much buffer was written? */ +-static inline unsigned int seq_buf_used(struct seq_buf *s) +-{ +- return min(s->len, s->size); +-} +- +-/** +- * seq_buf_terminate - Make sure buffer is nul terminated +- * @s: the seq_buf descriptor to terminate. +- * +- * This makes sure that the buffer in @s is nul terminated and +- * safe to read as a string. +- * +- * Note, if this is called when the buffer has overflowed, then +- * the last byte of the buffer is zeroed, and the len will still +- * point passed it. +- * +- * After this function is called, s->buffer is safe to use +- * in string operations. +- */ +-static inline void seq_buf_terminate(struct seq_buf *s) +-{ +- if (WARN_ON(s->size == 0)) +- return; +- +- if (seq_buf_buffer_left(s)) +- s->buffer[s->len] = 0; +- else +- s->buffer[s->size - 1] = 0; +-} +- +-/** +- * seq_buf_get_buf - get buffer to write arbitrary data to +- * @s: the seq_buf handle +- * @bufp: the beginning of the buffer is stored here +- * +- * Return the number of bytes available in the buffer, or zero if +- * there's no space. 
+- */ +-static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp) +-{ +- WARN_ON(s->len > s->size + 1); +- +- if (s->len < s->size) { +- *bufp = s->buffer + s->len; +- return s->size - s->len; +- } +- +- *bufp = NULL; +- return 0; +-} +- +-/** +- * seq_buf_commit - commit data to the buffer +- * @s: the seq_buf handle +- * @num: the number of bytes to commit +- * +- * Commit @num bytes of data written to a buffer previously acquired +- * by seq_buf_get. To signal an error condition, or that the data +- * didn't fit in the available space, pass a negative @num value. +- */ +-static inline void seq_buf_commit(struct seq_buf *s, int num) +-{ +- if (num < 0) { +- seq_buf_set_overflow(s); +- } else { +- /* num must be negative on overflow */ +- BUG_ON(s->len + num > s->size); +- s->len += num; +- } +-} +- +-extern __printf(2, 3) +-int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); +-extern __printf(2, 0) +-int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args); +-extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s); +-extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, +- int cnt); +-extern int seq_buf_puts(struct seq_buf *s, const char *str); +-extern int seq_buf_putc(struct seq_buf *s, unsigned char c); +-extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len); +-extern int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, +- unsigned int len); +-extern int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc); +-extern int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, +- int prefix_type, int rowsize, int groupsize, +- const void *buf, size_t len, bool ascii); +- +-#ifdef CONFIG_BINARY_PRINTF +-extern int +-seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); +-#endif +- +-#endif /* _LINUX_SEQ_BUF_H */ +diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h +index 76fbf92b04d9..12967748f9f7 100644 +--- a/include/linux/shrinker.h ++++ b/include/linux/shrinker.h +@@ -2,6 +2,8 @@ + #ifndef _LINUX_SHRINKER_H + #define _LINUX_SHRINKER_H + ++struct printbuf; ++ + /* + * This struct is used to pass information from page reclaim to the shrinkers. + * We consolidate the values for easier extension later. +@@ -58,10 +60,12 @@ struct shrink_control { + * @flags determine the shrinker abilities, like numa awareness + */ + struct shrinker { ++ char name[32]; + unsigned long (*count_objects)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan_objects)(struct shrinker *, + struct shrink_control *sc); ++ void (*to_text)(struct printbuf *, struct shrinker *); + + long batch; /* reclaim batch size, 0 = default */ + int seeks; /* seeks to recreate an obj */ +@@ -75,6 +79,9 @@ struct shrinker { + #endif + /* objs pending delete, per node */ + atomic_long_t *nr_deferred; ++ ++ atomic_long_t objects_requested_to_free; ++ atomic_long_t objects_freed; + }; + #define DEFAULT_SEEKS 2 /* A good number if you don't know better. 
*/ + +@@ -94,4 +101,5 @@ extern int register_shrinker(struct shrinker *shrinker); + extern void unregister_shrinker(struct shrinker *shrinker); + extern void free_prealloced_shrinker(struct shrinker *shrinker); + extern void synchronize_shrinkers(void); ++void shrinkers_to_text(struct printbuf *); + #endif +diff --git a/include/linux/six.h b/include/linux/six.h +new file mode 100644 +index 000000000000..477c33eb00d7 +--- /dev/null ++++ b/include/linux/six.h +@@ -0,0 +1,203 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/* ++ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw ++ * semaphores, except with a third intermediate state, intent. Basic operations ++ * are: ++ * ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * six_lock_intent(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * ++ * Intent locks block other intent locks, but do not block read locks, and you ++ * must have an intent lock held before taking a write lock, like so: ++ * ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade(): convert from intent to read ++ * six_lock_tryupgrade(): attempt to convert from read to intent ++ * ++ * Locks also embed a sequence number, which is incremented when the lock is ++ * locked or unlocked for write. The current sequence number can be grabbed ++ * while a lock is held from lock->state.seq; then, if you drop the lock you can ++ * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock ++ * iff it hasn't been locked for write in the meantime. ++ * ++ * There are also operations that take the lock type as a parameter, where the ++ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: ++ * ++ * six_lock_type(lock, type) ++ * six_unlock_type(lock, type) ++ * six_relock(lock, type, seq) ++ * six_trylock_type(lock, type) ++ * six_trylock_convert(lock, from, to) ++ * ++ * A lock may be held multiple times by the same thread (for read or intent, ++ * not write). However, the six locks code does _not_ implement the actual ++ * recursive checks itself though - rather, if your code (e.g. btree iterator ++ * code) knows that the current thread already has a lock held, and for the ++ * correct type, six_lock_increment() may be used to bump up the counter for ++ * that type - the only effect is that one more call to unlock will be required ++ * before the lock is unlocked. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define SIX_LOCK_SEPARATE_LOCKFNS ++ ++union six_lock_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ /* for waitlist_bitnr() */ ++ unsigned long l; ++ }; ++ ++ struct { ++ unsigned read_lock:27; ++ unsigned write_locking:1; ++ unsigned intent_lock:1; ++ unsigned waiters:3; ++ /* ++ * seq works much like in seqlocks: it's incremented every time ++ * we lock and unlock for write. ++ * ++ * If it's odd write lock is held, even unlocked. ++ * ++ * Thus readers can unlock, and then lock again later iff it ++ * hasn't been modified in the meantime. 
++ */ ++ u32 seq; ++ }; ++}; ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ union six_lock_state state; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ struct optimistic_spin_queue osq; ++ unsigned __percpu *readers; ++ ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list[2]; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++static __always_inline void __six_lock_init(struct six_lock *lock, ++ const char *name, ++ struct lock_class_key *key) ++{ ++ atomic64_set(&lock->state.counter, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++ ++#define six_lock_init(lock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *); \ ++bool six_relock_##type(struct six_lock *, u32); \ ++int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ ++void six_unlock_##type(struct six_lock *); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++#define SIX_LOCK_DISPATCH(type, fn, ...) \ ++ switch (type) { \ ++ case SIX_LOCK_read: \ ++ return fn##_read(__VA_ARGS__); \ ++ case SIX_LOCK_intent: \ ++ return fn##_intent(__VA_ARGS__); \ ++ case SIX_LOCK_write: \ ++ return fn##_write(__VA_ARGS__); \ ++ default: \ ++ BUG(); \ ++ } ++ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_trylock, lock); ++} ++ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); ++} ++ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); ++} ++ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_unlock, lock); ++} ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++void six_lock_pcpu_free_rcu(struct six_lock *); ++void six_lock_pcpu_free(struct six_lock *); ++void six_lock_pcpu_alloc(struct six_lock *); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/include/linux/string.h b/include/linux/string.h +index 61ec7e4f6311..22a45d553fbc 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -195,7 +195,12 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); + */ + #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) + ++struct printbuf; ++ + #ifdef CONFIG_BINARY_PRINTF ++void prt_vbinprintf(struct printbuf *out, const char *fmt, va_list args); ++void prt_bstrprintf(struct printbuf *out, const char *fmt, const u32 
*bin_buf); ++void prt_bprintf(struct printbuf *out, const char *fmt, ...) __printf(2, 3); + int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); + int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); + int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); +diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h +index 4d72258d42fd..52e0f1d283b9 100644 +--- a/include/linux/string_helpers.h ++++ b/include/linux/string_helpers.h +@@ -10,6 +10,7 @@ + struct device; + struct file; + struct task_struct; ++struct printbuf; + + /* Descriptions of the types of units to + * print in */ +@@ -18,8 +19,8 @@ enum string_size_units { + STRING_UNITS_2, /* use binary powers of 2^10 */ + }; + +-void string_get_size(u64 size, u64 blk_size, enum string_size_units units, +- char *buf, int len); ++int string_get_size(u64 size, u64 blk_size, enum string_size_units units, ++ char *buf, int len); + + #define UNESCAPE_SPACE BIT(0) + #define UNESCAPE_OCTAL BIT(1) +@@ -62,6 +63,8 @@ static inline int string_unescape_any_inplace(char *buf) + + #define ESCAPE_ALL_MASK GENMASK(8, 0) + ++void prt_escaped_string(struct printbuf *out, const char *src, size_t isz, ++ unsigned int flags, const char *only); + int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, + unsigned int flags, const char *only); + +@@ -71,6 +74,7 @@ static inline int string_escape_mem_any_np(const char *src, size_t isz, + return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only); + } + ++ + static inline int string_escape_str(const char *src, char *dst, size_t sz, + unsigned int flags, const char *only) + { +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index e6e95a9f07a5..48471e32f8e4 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -496,7 +496,7 @@ struct dynevent_cmd; + typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); + + struct dynevent_cmd { +- struct seq_buf seq; ++ struct printbuf seq; + const char *event_name; + unsigned int n_fields; + enum dynevent_type type; +diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h +index 5a2c650d9e1c..d2b51007b3b9 100644 +--- a/include/linux/trace_seq.h ++++ b/include/linux/trace_seq.h +@@ -2,10 +2,12 @@ + #ifndef _LINUX_TRACE_SEQ_H + #define _LINUX_TRACE_SEQ_H + +-#include ++#include + + #include + ++struct seq_file; ++ + /* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE). 
+@@ -13,14 +15,16 @@ + + struct trace_seq { + char buffer[PAGE_SIZE]; +- struct seq_buf seq; ++ struct printbuf seq; ++ unsigned readpos; + int full; + }; + + static inline void + trace_seq_init(struct trace_seq *s) + { +- seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); ++ s->seq = PRINTBUF_EXTERN(s->buffer, PAGE_SIZE); ++ s->readpos = 0; + s->full = 0; + } + +@@ -39,7 +43,7 @@ trace_seq_init(struct trace_seq *s) + */ + static inline int trace_seq_used(struct trace_seq *s) + { +- return seq_buf_used(&s->seq); ++ return printbuf_written(&s->seq); + } + + /** +@@ -54,7 +58,7 @@ static inline int trace_seq_used(struct trace_seq *s) + static inline char * + trace_seq_buffer_ptr(struct trace_seq *s) + { +- return s->buffer + seq_buf_used(&s->seq); ++ return s->buffer + printbuf_written(&s->seq); + } + + /** +@@ -66,7 +70,7 @@ trace_seq_buffer_ptr(struct trace_seq *s) + */ + static inline bool trace_seq_has_overflowed(struct trace_seq *s) + { +- return s->full || seq_buf_has_overflowed(&s->seq); ++ return s->full || printbuf_overflowed(&s->seq); + } + + /* +@@ -87,6 +91,7 @@ extern void trace_seq_putc(struct trace_seq *s, unsigned char c); + extern void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len); + extern void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len); ++struct path; + extern int trace_seq_path(struct trace_seq *s, const struct path *path); + + extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 096d48aa3437..8d11e2e4ddc8 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -144,6 +144,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1); + extern void *vmalloc_user(unsigned long size) __alloc_size(1); + extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); + extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); ++extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1); + extern void *vmalloc_32(unsigned long size) __alloc_size(1); + extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h +index 24a509f559ee..0b20ee6854d6 100644 +--- a/include/net/9p/9p.h ++++ b/include/net/9p/9p.h +@@ -539,12 +539,12 @@ struct p9_rstatfs { + struct p9_fcall { + u32 size; + u8 id; ++ bool used_mempool; + u16 tag; + + size_t offset; + size_t capacity; + +- struct kmem_cache *cache; + u8 *sdata; + }; + +diff --git a/include/net/9p/client.h b/include/net/9p/client.h +index ec1d1706f43c..832dcc866a20 100644 +--- a/include/net/9p/client.h ++++ b/include/net/9p/client.h +@@ -9,6 +9,7 @@ + #ifndef NET_9P_CLIENT_H + #define NET_9P_CLIENT_H + ++#include + #include + #include + +@@ -76,7 +77,7 @@ enum p9_req_status_t { + struct p9_req_t { + int status; + int t_err; +- struct kref refcount; ++ refcount_t refcount; + wait_queue_head_t wq; + struct p9_fcall tc; + struct p9_fcall rc; +@@ -107,6 +108,14 @@ struct p9_client { + void *trans; + struct kmem_cache *fcall_cache; + ++ /* ++ * We need two identical mempools because it's not safe to allocate ++ * multiple elements from the same pool (without freeing the first); ++ * that will deadlock if multiple threads need the last element at the ++ * same time. 
++ */ ++ mempool_t pools[2]; ++ + union { + struct { + int rfd; +@@ -222,20 +231,21 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, + kgid_t gid, struct p9_qid *qid); + int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status); + int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); +-void p9_fcall_fini(struct p9_fcall *fc); ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx); + struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag); + + static inline void p9_req_get(struct p9_req_t *r) + { +- kref_get(&r->refcount); ++ refcount_inc(&r->refcount); + } + + static inline int p9_req_try_get(struct p9_req_t *r) + { +- return kref_get_unless_zero(&r->refcount); ++ return refcount_inc_not_zero(&r->refcount); + } + +-int p9_req_put(struct p9_req_t *r); ++int p9_req_put(struct p9_client *c, struct p9_req_t *r); + + void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +new file mode 100644 +index 000000000000..140834e7406e +--- /dev/null ++++ b/include/trace/events/bcachefs.h +@@ -0,0 +1,1048 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM bcachefs ++ ++#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_BCACHE_H ++ ++#include ++ ++DECLARE_EVENT_CLASS(bpos, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = p->inode; ++ __entry->offset = p->offset; ++ ), ++ ++ TP_printk("%llu:%llu", __entry->inode, __entry->offset) ++); ++ ++DECLARE_EVENT_CLASS(bkey, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = k->p.inode; ++ __entry->offset = k->p.offset; ++ __entry->size = k->size; ++ ), ++ ++ TP_printk("%llu:%llu len %u", __entry->inode, ++ __entry->offset, __entry->size) ++); ++ ++DECLARE_EVENT_CLASS(bch_fs, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ ), ++ ++ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ++); ++ ++DECLARE_EVENT_CLASS(bio, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(sector_t, sector ) ++ __field(unsigned int, nr_sector ) ++ __array(char, rwbs, 6 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = bio->bi_bdev ? 
bio_dev(bio) : 0; ++ __entry->sector = bio->bi_iter.bi_sector; ++ __entry->nr_sector = bio->bi_iter.bi_size >> 9; ++ blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ++ ), ++ ++ TP_printk("%d,%d %s %llu + %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, ++ (unsigned long long)__entry->sector, __entry->nr_sector) ++); ++ ++/* super-io.c: */ ++TRACE_EVENT(write_super, ++ TP_PROTO(struct bch_fs *c, unsigned long ip), ++ TP_ARGS(c, ip), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%d,%d for %pS", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ (void *) __entry->ip) ++); ++ ++/* io.c: */ ++ ++DEFINE_EVENT(bio, read_split, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_bounce, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_retry, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, promote, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* Journal */ ++ ++DEFINE_EVENT(bch_fs, journal_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, journal_entry_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bio, journal_write, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++TRACE_EVENT(journal_reclaim_start, ++ TP_PROTO(struct bch_fs *c, bool direct, bool kicked, ++ u64 min_nr, u64 min_key_cache, ++ u64 prereserved, u64 prereserved_total, ++ u64 btree_cache_dirty, u64 btree_cache_total, ++ u64 btree_key_cache_dirty, u64 btree_key_cache_total), ++ TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, ++ btree_cache_dirty, btree_cache_total, ++ btree_key_cache_dirty, btree_key_cache_total), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(bool, direct ) ++ __field(bool, kicked ) ++ __field(u64, min_nr ) ++ __field(u64, min_key_cache ) ++ __field(u64, prereserved ) ++ __field(u64, prereserved_total ) ++ __field(u64, btree_cache_dirty ) ++ __field(u64, btree_cache_total ) ++ __field(u64, btree_key_cache_dirty ) ++ __field(u64, btree_key_cache_total ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->direct = direct; ++ __entry->kicked = kicked; ++ __entry->min_nr = min_nr; ++ __entry->min_key_cache = min_key_cache; ++ __entry->prereserved = prereserved; ++ __entry->prereserved_total = prereserved_total; ++ __entry->btree_cache_dirty = btree_cache_dirty; ++ __entry->btree_cache_total = btree_cache_total; ++ __entry->btree_key_cache_dirty = btree_key_cache_dirty; ++ __entry->btree_key_cache_total = btree_key_cache_total; ++ ), ++ ++ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->direct, ++ __entry->kicked, ++ __entry->min_nr, ++ __entry->min_key_cache, ++ __entry->prereserved, ++ __entry->prereserved_total, ++ __entry->btree_cache_dirty, ++ __entry->btree_cache_total, ++ __entry->btree_key_cache_dirty, ++ __entry->btree_key_cache_total) ++); ++ ++TRACE_EVENT(journal_reclaim_finish, ++ TP_PROTO(struct bch_fs *c, u64 nr_flushed), ++ TP_ARGS(c, nr_flushed), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, nr_flushed ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->nr_flushed = nr_flushed; ++ ), ++ ++ TP_printk("%d,%d flushed %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->nr_flushed) 
++); ++ ++/* allocator: */ ++ ++/* bset.c: */ ++ ++DEFINE_EVENT(bpos, bkey_pack_pos_fail, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p) ++); ++ ++/* Btree */ ++ ++DECLARE_EVENT_CLASS(btree_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u8, level ) ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->level = b->c.level; ++ __entry->id = b->c.btree_id; ++ __entry->inode = b->key.k.p.inode; ++ __entry->offset = b->key.k.p.offset; ++ ), ++ ++ TP_printk("%d,%d %u id %u %llu:%llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->level, __entry->id, ++ __entry->inode, __entry->offset) ++); ++ ++DEFINE_EVENT(btree_node, btree_read, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_write, ++ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), ++ TP_ARGS(b, bytes, sectors), ++ ++ TP_STRUCT__entry( ++ __field(enum btree_node_type, type) ++ __field(unsigned, bytes ) ++ __field(unsigned, sectors ) ++ ), ++ ++ TP_fast_assign( ++ __entry->type = btree_node_type(b); ++ __entry->bytes = bytes; ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("bkey type %u bytes %u sectors %u", ++ __entry->type , __entry->bytes, __entry->sectors) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_free, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_reap, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++TRACE_EVENT(btree_reserve_get_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ size_t required), ++ TP_ARGS(trans_fn, caller_ip, required), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(size_t, required ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->required = required; ++ ), ++ ++ TP_printk("%s %pS required %zu", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->required) ++); ++ ++DEFINE_EVENT(btree_node, btree_split, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_compact, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_merge, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_rewrite, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_set_root, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_cache_scan, ++ TP_PROTO(long nr_to_scan, long can_free, long ret), ++ TP_ARGS(nr_to_scan, can_free, ret), ++ ++ TP_STRUCT__entry( ++ __field(long, nr_to_scan ) ++ __field(long, can_free ) ++ __field(long, ret ) ++ ), ++ ++ TP_fast_assign( ++ 
__entry->nr_to_scan = nr_to_scan; ++ __entry->can_free = can_free; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("scanned for %li nodes, can free %li, ret %li", ++ __entry->nr_to_scan, __entry->can_free, __entry->ret) ++); ++ ++TRACE_EVENT(btree_node_relock_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned long node, ++ u32 iter_lock_seq, ++ u32 node_lock_seq), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ __field(unsigned long, node ) ++ __field(u32, iter_lock_seq ) ++ __field(u32, node_lock_seq ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->node = node; ++ __entry->iter_lock_seq = iter_lock_seq; ++ __entry->node_lock_seq = node_lock_seq; ++ ), ++ ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, ++ __entry->node, ++ __entry->iter_lock_seq, ++ __entry->node_lock_seq) ++); ++ ++/* Garbage collection */ ++ ++DEFINE_EVENT(bch_fs, gc_gens_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_gens_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++/* Allocator */ ++ ++TRACE_EVENT(bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ ), ++ ++ TP_printk("%d,%d reserve %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve) ++); ++ ++TRACE_EVENT(bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 free, ++ u64 avail, ++ u64 copygc_wait_amount, ++ s64 copygc_waiting_for, ++ u64 seen, ++ u64 open, ++ u64 need_journal_commit, ++ u64 nouse, ++ bool nonblocking, ++ const char *err), ++ TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, ++ seen, open, need_journal_commit, nouse, nonblocking, err), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ __field(u64, free ) ++ __field(u64, avail ) ++ __field(u64, copygc_wait_amount ) ++ __field(s64, copygc_waiting_for ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, nouse ) ++ __field(bool, nonblocking ) ++ __array(char, err, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ __entry->free = free; ++ __entry->avail = avail; ++ __entry->copygc_wait_amount = copygc_wait_amount; ++ __entry->copygc_waiting_for = copygc_waiting_for; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->nouse = nouse; ++ __entry->nonblocking = nonblocking; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ ), ++ ++ TP_printk("%d,%d reserve %s 
free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve, ++ __entry->free, ++ __entry->avail, ++ __entry->copygc_wait_amount, ++ __entry->copygc_waiting_for, ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->nouse, ++ __entry->nonblocking, ++ __entry->err) ++); ++ ++TRACE_EVENT(discard_buckets, ++ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, ++ u64 need_journal_commit, u64 discarded, const char *err), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, err), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, discarded ) ++ __array(char, err, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->discarded = discarded; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ ), ++ ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->discarded, ++ __entry->err) ++); ++ ++TRACE_EVENT(invalidate_bucket, ++ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), ++ TP_ARGS(c, dev, bucket, sectors), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u32, dev_idx ) ++ __field(u32, sectors ) ++ __field(u64, bucket ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->dev_idx = dev; ++ __entry->sectors = sectors; ++ __entry->bucket = bucket; ++ ), ++ ++ TP_printk("%d:%d invalidated %u:%llu cached sectors %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->dev_idx, __entry->bucket, ++ __entry->sectors) ++); ++ ++/* Moving IO */ ++ ++DEFINE_EVENT(bkey, move_extent, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_alloc_mem_fail, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_race, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++TRACE_EVENT(move_data, ++ TP_PROTO(struct bch_fs *c, u64 sectors_moved, ++ u64 keys_moved), ++ TP_ARGS(c, sectors_moved, keys_moved), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, sectors_moved ) ++ __field(u64, keys_moved ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->sectors_moved = sectors_moved; ++ __entry->keys_moved = keys_moved; ++ ), ++ ++ TP_printk("%d,%d sectors_moved %llu keys_moved %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->sectors_moved, __entry->keys_moved) ++); ++ ++TRACE_EVENT(copygc, ++ TP_PROTO(struct bch_fs *c, ++ u64 sectors_moved, u64 sectors_not_moved, ++ u64 buckets_moved, u64 buckets_not_moved), ++ TP_ARGS(c, ++ sectors_moved, sectors_not_moved, ++ buckets_moved, buckets_not_moved), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, sectors_moved ) ++ __field(u64, sectors_not_moved ) ++ __field(u64, buckets_moved ) ++ __field(u64, buckets_not_moved ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->sectors_moved = sectors_moved; ++ __entry->sectors_not_moved = sectors_not_moved; ++ __entry->buckets_moved = buckets_moved; ++ __entry->buckets_not_moved = buckets_moved; ++ ), ++ ++ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ 
__entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) ++); ++ ++TRACE_EVENT(copygc_wait, ++ TP_PROTO(struct bch_fs *c, ++ u64 wait_amount, u64 until), ++ TP_ARGS(c, wait_amount, until), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, wait_amount ) ++ __field(u64, until ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->wait_amount = wait_amount; ++ __entry->until = until; ++ ), ++ ++ TP_printk("%d,%u waiting for %llu sectors until %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->wait_amount, __entry->until) ++); ++ ++DECLARE_EVENT_CLASS(transaction_event, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ ), ++ ++ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, transaction_commit, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, transaction_restart_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, transaction_restart_injected, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_fault_inject, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_traverse_all, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart_iter, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, 
sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ ), ++ ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ 
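Each DEFINE_EVENT above reuses the layout of its DECLARE_EVENT_CLASS, and the ftrace macro machinery turns every event name into a trace_<name>() helper that takes the TP_PROTO arguments. As a hedged illustration (not part of the patch itself; the surrounding call site and the trans->fn field are assumptions made for the sketch, only the generated helper name and its const char * / unsigned long arguments follow from the definitions above):

	/* illustrative sketch only, not an addition to the patch */
	static void example_emit_restart_event(struct btree_trans *trans)
	{
		/* trans->fn is assumed to hold the transaction's function name */
		trace_trans_restart_too_many_iters(trans->fn, _RET_IP_);
	}

Once compiled in, such an event can be enabled and read back through the usual tracefs files (events/<system>/trans_restart_too_many_iters/enable and trace_pipe), with the output formatted by the TP_printk() string of the transaction_event class.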
++TRACE_EVENT(trans_restart_would_deadlock, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ bool in_traverse_all, ++ unsigned reason, ++ enum btree_id have_btree_id, ++ unsigned have_iter_type, ++ struct bpos *have_pos, ++ enum btree_id want_btree_id, ++ unsigned want_iter_type, ++ struct bpos *want_pos), ++ TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, ++ have_btree_id, have_iter_type, have_pos, ++ want_btree_id, want_iter_type, want_pos), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, in_traverse_all ) ++ __field(u8, reason ) ++ __field(u8, have_btree_id ) ++ __field(u8, have_iter_type ) ++ __field(u8, want_btree_id ) ++ __field(u8, want_iter_type ) ++ ++ __field(u64, have_pos_inode ) ++ __field(u64, have_pos_offset ) ++ __field(u32, have_pos_snapshot) ++ __field(u32, want_pos_snapshot) ++ __field(u64, want_pos_inode ) ++ __field(u64, want_pos_offset ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->in_traverse_all = in_traverse_all; ++ __entry->reason = reason; ++ __entry->have_btree_id = have_btree_id; ++ __entry->have_iter_type = have_iter_type; ++ __entry->want_btree_id = want_btree_id; ++ __entry->want_iter_type = want_iter_type; ++ ++ __entry->have_pos_inode = have_pos->inode; ++ __entry->have_pos_offset = have_pos->offset; ++ __entry->have_pos_snapshot = have_pos->snapshot; ++ ++ __entry->want_pos_inode = want_pos->inode; ++ __entry->want_pos_offset = want_pos->offset; ++ __entry->want_pos_snapshot = want_pos->snapshot; ++ ), ++ ++ TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->in_traverse_all, ++ __entry->reason, ++ __entry->have_btree_id, ++ __entry->have_iter_type, ++ __entry->have_pos_inode, ++ __entry->have_pos_offset, ++ __entry->have_pos_snapshot, ++ __entry->want_btree_id, ++ __entry->want_iter_type, ++ __entry->want_pos_inode, ++ __entry->want_pos_offset, ++ __entry->want_pos_snapshot) ++); ++ ++TRACE_EVENT(trans_restart_would_deadlock_write, ++ TP_PROTO(const char *trans_fn), ++ TP_ARGS(trans_fn), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ ), ++ ++ TP_printk("%s", __entry->trans_fn) ++); ++ ++TRACE_EVENT(trans_restart_mem_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ unsigned long bytes), ++ TP_ARGS(trans_fn, caller_ip, bytes), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(unsigned long, bytes ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->bytes = bytes; ++ ), ++ ++ TP_printk("%s %pS bytes %lu", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->bytes) ++); ++ ++TRACE_EVENT(trans_restart_key_cache_key_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned old_u64s, ++ unsigned new_u64s), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(enum btree_id, btree_id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, snapshot ) ++ __field(u32, old_u64s ) ++ 
__field(u32, new_u64s ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->inode = pos->inode; ++ __entry->offset = pos->offset; ++ __entry->snapshot = pos->snapshot; ++ __entry->old_u64s = old_u64s; ++ __entry->new_u64s = new_u64s; ++ ), ++ ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ bch2_btree_ids[__entry->btree_id], ++ __entry->inode, ++ __entry->offset, ++ __entry->snapshot, ++ __entry->old_u64s, ++ __entry->new_u64s) ++); ++ ++#endif /* _TRACE_BCACHE_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/init/init_task.c b/init/init_task.c +index 73cc8f03511a..3e3aed110153 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -85,6 +85,7 @@ struct task_struct init_task + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, ++ .faults_disabled_mapping = NULL, + .restart_block = { + .fn = do_no_restart_syscall, + }, +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 4198f0273ecd..b2abd9a5d9ab 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB + config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP ++ ++config SIXLOCKS ++ bool +diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile +index d51cabf28f38..cadbf6520c4b 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o + obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o ++obj-$(CONFIG_SIXLOCKS) += six.o +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index f06b91ca6482..0b1a3a949b47 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -6483,6 +6483,26 @@ void debug_check_no_locks_held(void) + } + EXPORT_SYMBOL_GPL(debug_check_no_locks_held); + ++#ifdef CONFIG_LOCKDEP ++int lock_class_is_held(struct lock_class_key *key) ++{ ++ struct task_struct *curr = current; ++ struct held_lock *hlock; ++ ++ if (unlikely(!debug_locks)) ++ return 0; ++ ++ for (hlock = curr->held_locks; ++ hlock < curr->held_locks + curr->lockdep_depth; ++ hlock++) ++ if (hlock->instance->key == key) ++ return 1; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(lock_class_is_held); ++#endif ++ + #ifdef __KERNEL__ + void debug_show_all_locks(void) + { +diff --git a/kernel/locking/six.c b/kernel/locking/six.c +new file mode 100644 +index 000000000000..fca1208720b6 +--- /dev/null ++++ b/kernel/locking/six.c +@@ -0,0 +1,759 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) ++#define six_release(l) lock_release(l, _RET_IP_) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u64 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u64 lock_fail; ++ ++ /* Value we add to the lock in order to release the lock: */ ++ u64 unlock_val; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u64 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type 
unlock_wakeup; ++}; ++ ++#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) ++#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) ++#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) ++ ++#define LOCK_VALS { \ ++ [SIX_LOCK_read] = { \ ++ .lock_val = __SIX_VAL(read_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ ++ .unlock_val = -__SIX_VAL(read_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_read, \ ++ .unlock_wakeup = SIX_LOCK_write, \ ++ }, \ ++ [SIX_LOCK_intent] = { \ ++ .lock_val = __SIX_VAL(intent_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_intent, \ ++ .unlock_val = -__SIX_VAL(intent_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_intent, \ ++ .unlock_wakeup = SIX_LOCK_intent, \ ++ }, \ ++ [SIX_LOCK_write] = { \ ++ .lock_val = __SIX_VAL(seq, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_read, \ ++ .unlock_val = __SIX_VAL(seq, 1), \ ++ .held_mask = __SIX_LOCK_HELD_write, \ ++ .unlock_wakeup = SIX_LOCK_read, \ ++ }, \ ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ union six_lock_state old) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!old.intent_lock) { ++ EBUG_ON(lock->owner); ++ lock->owner = current; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static inline unsigned pcpu_read_count(struct six_lock *lock) ++{ ++ unsigned read_count = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ read_count += *per_cpu_ptr(lock->readers, cpu); ++ return read_count; ++} ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++}; ++ ++/* This is probably up there with the more evil things I've done */ ++#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) ++ ++static inline void six_lock_wakeup(struct six_lock *lock, ++ union six_lock_state state, ++ unsigned waitlist_id) ++{ ++ if (waitlist_id == SIX_LOCK_write) { ++ if (state.write_locking && !state.read_lock) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ if (p) ++ wake_up_process(p); ++ } ++ } else { ++ struct list_head *wait_list = &lock->wait_list[waitlist_id]; ++ struct six_lock_waiter *w, *next; ++ ++ if (!(state.waiters & (1 << waitlist_id))) ++ return; ++ ++ clear_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, wait_list, list) { ++ list_del_init(&w->list); ++ ++ if (wake_up_process(w->task) && ++ waitlist_id != SIX_LOCK_read) { ++ if (!list_empty(wait_list)) ++ set_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ break; ++ } ++ } ++ ++ raw_spin_unlock(&lock->wait_lock); ++ } ++} ++ ++static __always_inline bool do_six_trylock_type(struct six_lock *lock, ++ enum six_lock_type type, ++ bool try) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ bool ret; ++ u64 v; ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); ++ EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); ++ ++ EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); ++ ++ /* ++ * Percpu reader mode: ++ * ++ * The basic idea behind this algorithm is that you can implement a lock ++ * between two threads without any atomics, just memory barriers: ++ * ++ * For two threads you'll need two variables, one variable for "thread a ++ * has the lock" and another for "thread b has the lock". 
++ * ++ * To take the lock, a thread sets its variable indicating that it holds ++ * the lock, then issues a full memory barrier, then reads from the ++ * other thread's variable to check if the other thread thinks it has ++ * the lock. If we raced, we backoff and retry/sleep. ++ */ ++ ++ if (type == SIX_LOCK_read && lock->readers) { ++retry: ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); /* signal that we own lock */ ++ ++ smp_mb(); ++ ++ old.v = READ_ONCE(lock->state.v); ++ ret = !(old.v & l[type].lock_fail); ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ /* ++ * If we failed because a writer was trying to take the ++ * lock, issue a wakeup because we might have caused a ++ * spurious trylock failure: ++ */ ++ if (old.write_locking) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ } ++ ++ /* ++ * If we failed from the lock path and the waiting bit wasn't ++ * set, set it: ++ */ ++ if (!try && !ret) { ++ v = old.v; ++ ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) ++ goto retry; ++ ++ if (new.waiters & (1 << type)) ++ break; ++ ++ new.waiters |= 1 << type; ++ } while ((v = atomic64_cmpxchg(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ } ++ } else if (type == SIX_LOCK_write && lock->readers) { ++ if (try) { ++ atomic64_add(__SIX_VAL(write_locking, 1), ++ &lock->state.counter); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = !pcpu_read_count(lock); ++ ++ /* ++ * On success, we increment lock->seq; also we clear ++ * write_locking unless we failed from the lock path: ++ */ ++ v = 0; ++ if (ret) ++ v += __SIX_VAL(seq, 1); ++ if (ret || try) ++ v -= __SIX_VAL(write_locking, 1); ++ ++ if (try && !ret) { ++ old.v = atomic64_add_return(v, &lock->state.counter); ++ six_lock_wakeup(lock, old, SIX_LOCK_read); ++ } else { ++ atomic64_add(v, &lock->state.counter); ++ } ++ } else { ++ v = READ_ONCE(lock->state.v); ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) { ++ new.v += l[type].lock_val; ++ ++ if (type == SIX_LOCK_write) ++ new.write_locking = 0; ++ } else if (!try && type != SIX_LOCK_write && ++ !(new.waiters & (1 << type))) ++ new.waiters |= 1 << type; ++ else ++ break; /* waiting bit already set */ ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ ret = !(old.v & l[type].lock_fail); ++ ++ EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); ++ } ++ ++ if (ret) ++ six_set_owner(lock, type, old); ++ ++ EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); ++ ++ return ret; ++} ++ ++__always_inline __flatten ++static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ if (!do_six_trylock_type(lock, type, true)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v; ++ ++ EBUG_ON(type == SIX_LOCK_write); ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ bool ret; ++ ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); ++ ++ smp_mb(); ++ ++ old.v = READ_ONCE(lock->state.v); ++ ret = !(old.v & l[type].lock_fail) && old.seq == seq; ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ /* ++ * Similar to the lock path, we may have caused a spurious write ++ * lock fail and need to issue a 
wakeup: ++ */ ++ if (old.write_locking) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ } ++ ++ if (ret) ++ six_acquire(&lock->dep_map, 1); ++ ++ return ret; ++ } ++ ++ v = READ_ONCE(lock->state.v); ++ do { ++ old.v = v; ++ ++ if (old.seq != seq || old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline int six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ int retval = 1; ++ ++ if (need_resched()) ++ return 0; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ if (owner) ++ retval = owner->on_cpu; ++ rcu_read_unlock(); ++ /* ++ * if lock->owner is not set, the mutex owner may have just acquired ++ * it and not set the owner yet or the mutex has been released. ++ */ ++ return retval; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner) ++{ ++ bool ret = true; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner->on_cpu || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner)) ++ break; ++ ++ if (do_six_trylock_type(lock, type, false)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. 
++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ union six_lock_state old; ++ struct six_lock_waiter wait; ++ int ret = 0; ++ ++ if (type == SIX_LOCK_write) { ++ EBUG_ON(lock->state.write_locking); ++ atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ goto out_before_sleep; ++ ++ if (six_optimistic_spin(lock, type)) ++ goto out_before_sleep; ++ ++ lock_contended(&lock->dep_map, _RET_IP_); ++ ++ INIT_LIST_HEAD(&wait.list); ++ wait.task = current; ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (type == SIX_LOCK_write) ++ EBUG_ON(lock->owner != current); ++ else if (list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_add_tail(&wait.list, &lock->wait_list[type]); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ if (do_six_trylock_type(lock, type, false)) ++ break; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ break; ++ ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (!list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_del_init(&wait.list); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++out_before_sleep: ++ if (ret && type == SIX_LOCK_write) { ++ old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), ++ &lock->state.counter); ++ six_lock_wakeup(lock, old, SIX_LOCK_read); ++ } ++ ++ return ret; ++} ++ ++__always_inline ++static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ int ret; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0); ++ ++ ret = do_six_trylock_type(lock, type, true) ? 
0 ++ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ if (!ret) ++ lock_acquired(&lock->dep_map, _RET_IP_); ++ ++ return ret; ++} ++ ++__always_inline __flatten ++static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state state; ++ ++ EBUG_ON(type == SIX_LOCK_write && ++ !(lock->state.v & __SIX_LOCK_HELD_intent)); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ ++ if (type == SIX_LOCK_intent) { ++ EBUG_ON(lock->owner != current); ++ ++ if (lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ lock->owner = NULL; ++ } ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ smp_mb(); /* unlock barrier */ ++ this_cpu_dec(*lock->readers); ++ smp_mb(); /* between unlocking and checking for waiters */ ++ state.v = READ_ONCE(lock->state.v); ++ } else { ++ EBUG_ON(!(lock->state.v & l[type].held_mask)); ++ state.v = atomic64_add_return_release(l[type].unlock_val, ++ &lock->state.counter); ++ } ++ ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return __six_trylock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_trylock_##type); \ ++ \ ++bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ ++} \ ++EXPORT_SYMBOL_GPL(six_relock_##type); \ ++ \ ++int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p) \ ++{ \ ++ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ ++} \ ++EXPORT_SYMBOL_GPL(six_lock_##type); \ ++ \ ++void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ __six_unlock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_unlock_##type); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++ ++#undef __SIX_LOCK ++ ++/* Convert from intent to read: */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ union six_lock_state old, new; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ new.v = old.v = v; ++ ++ if (new.intent_lock) ++ return false; ++ ++ if (!lock->readers) { ++ EBUG_ON(!new.read_lock); ++ new.read_lock--; ++ } ++ ++ new.intent_lock = 1; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ if (lock->readers) ++ this_cpu_dec(*lock->readers); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/* ++ * Increment read/intent lock count, assuming we already have it read or intent ++ * locked: ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ ++ six_acquire(&lock->dep_map, 0); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ 
++ switch (type) { ++ case SIX_LOCK_read: ++ if (lock->readers) { ++ this_cpu_inc(*lock->readers); ++ } else { ++ EBUG_ON(!lock->state.read_lock && ++ !lock->state.intent_lock); ++ atomic64_add(l[type].lock_val, &lock->state.counter); ++ } ++ break; ++ case SIX_LOCK_intent: ++ EBUG_ON(!lock->state.intent_lock); ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ struct six_lock_waiter *w; ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry(w, &lock->wait_list[0], list) ++ wake_up_process(w->task); ++ list_for_each_entry(w, &lock->wait_list[1], list) ++ wake_up_process(w->task); ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); ++ ++struct free_pcpu_rcu { ++ struct rcu_head rcu; ++ void __percpu *p; ++}; ++ ++static void free_pcpu_rcu_fn(struct rcu_head *_rcu) ++{ ++ struct free_pcpu_rcu *rcu = ++ container_of(_rcu, struct free_pcpu_rcu, rcu); ++ ++ free_percpu(rcu->p); ++ kfree(rcu); ++} ++ ++void six_lock_pcpu_free_rcu(struct six_lock *lock) ++{ ++ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); ++ ++ if (!rcu) ++ return; ++ ++ rcu->p = lock->readers; ++ lock->readers = NULL; ++ ++ call_rcu(&rcu->rcu, free_pcpu_rcu_fn); ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); ++ ++void six_lock_pcpu_free(struct six_lock *lock) ++{ ++ BUG_ON(lock->readers && pcpu_read_count(lock)); ++ BUG_ON(lock->state.read_lock); ++ ++ free_percpu(lock->readers); ++ lock->readers = NULL; ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_free); ++ ++void six_lock_pcpu_alloc(struct six_lock *lock) ++{ ++#ifdef __KERNEL__ ++ if (!lock->readers) ++ lock->readers = alloc_percpu(unsigned); ++#endif ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); +diff --git a/kernel/module/main.c b/kernel/module/main.c +index 0548151dd933..55ba98a99387 100644 +--- a/kernel/module/main.c ++++ b/kernel/module/main.c +@@ -1608,9 +1608,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) + + void * __weak module_alloc(unsigned long size) + { +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, +- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, +- NUMA_NO_NODE, __builtin_return_address(0)); ++ return vmalloc_exec(size, GFP_KERNEL); + } + + bool __weak module_init_section(const char *name) +diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c +index 9ed5ce989415..3428568bb3f1 100644 +--- a/kernel/stacktrace.c ++++ b/kernel/stacktrace.c +@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + put_task_stack(tsk); + return c.len; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array +@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index b8dd54627075..26cfe909f9af 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -1673,15 +1673,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) + { + int len; + +- if (trace_seq_used(s) <= s->seq.readpos) ++ if (trace_seq_used(s) <= s->readpos) + return -EBUSY; + +- len = trace_seq_used(s) - s->seq.readpos; 
++ len = trace_seq_used(s) - s->readpos; + if (cnt > len) + cnt = len; +- memcpy(buf, s->buffer + s->seq.readpos, cnt); ++ memcpy(buf, s->buffer + s->readpos, cnt); + +- s->seq.readpos += cnt; ++ s->readpos += cnt; + return cnt; + } + +@@ -3728,11 +3728,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, + + static const char *show_buffer(struct trace_seq *s) + { +- struct seq_buf *seq = &s->seq; +- +- seq_buf_terminate(seq); +- +- return seq->buffer; ++ return printbuf_str(&s->seq); + } + + static DEFINE_STATIC_KEY_FALSE(trace_no_verify); +@@ -6759,12 +6755,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { + enum print_line_t ret; +- int save_len = iter->seq.seq.len; ++ int save_pos = iter->seq.seq.pos; + + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + /* don't print partial lines */ +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + if (ret != TRACE_TYPE_NO_CONSUME) +@@ -6786,7 +6782,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); +- if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) ++ if (iter->seq.readpos >= trace_seq_used(&iter->seq)) + trace_seq_init(&iter->seq); + + /* +@@ -6812,16 +6808,15 @@ static size_t + tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) + { + size_t count; +- int save_len; + int ret; + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { +- save_len = iter->seq.seq.len; ++ unsigned save_pos = iter->seq.seq.pos; + ret = print_trace_line(iter); + + if (trace_seq_has_overflowed(&iter->seq)) { +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + +@@ -6831,14 +6826,14 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) + * anyway to be safe. + */ + if (ret == TRACE_TYPE_PARTIAL_LINE) { +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + +- count = trace_seq_used(&iter->seq) - save_len; ++ count = trace_seq_used(&iter->seq) - save_pos; + if (rem < count) { + rem = 0; +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + +@@ -9827,20 +9822,8 @@ static struct notifier_block trace_die_notifier = { + void + trace_printk_seq(struct trace_seq *s) + { +- /* Probably should print a warning here. */ +- if (s->seq.len >= TRACE_MAX_PRINT) +- s->seq.len = TRACE_MAX_PRINT; +- +- /* +- * More paranoid code. Although the buffer size is set to +- * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just +- * an extra layer of protection. +- */ +- if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) +- s->seq.len = s->seq.size - 1; +- + /* should be zero ended, but we are paranoid. 
*/ +- s->buffer[s->seq.len] = 0; ++ printbuf_nul_terminate(&s->seq); + + printk(KERN_TRACE "%s", s->buffer); + +diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c +index 076b447a1b88..30a106c16871 100644 +--- a/kernel/trace/trace_dynevent.c ++++ b/kernel/trace/trace_dynevent.c +@@ -290,21 +290,19 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, + struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg) + { +- int ret = 0; +- + if (check_arg) { +- ret = check_arg(arg); ++ int ret = check_arg(arg); + if (ret) + return ret; + } + +- ret = seq_buf_printf(&cmd->seq, " %s%c", arg->str, arg->separator); +- if (ret) { ++ prt_printf(&cmd->seq, " %s%c", arg->str, arg->separator); ++ if (printbuf_overflowed(&cmd->seq)) { + pr_err("String is too long: %s%c\n", arg->str, arg->separator); + return -E2BIG; + } + +- return ret; ++ return 0; + } + + /** +@@ -335,25 +333,23 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg) + { +- int ret = 0; +- + if (check_arg) { +- ret = check_arg(arg_pair); ++ int ret = check_arg(arg_pair); + if (ret) + return ret; + } + +- ret = seq_buf_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs, +- arg_pair->operator, arg_pair->rhs, +- arg_pair->separator); +- if (ret) { ++ prt_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs, ++ arg_pair->operator, arg_pair->rhs, ++ arg_pair->separator); ++ if (printbuf_overflowed(&cmd->seq)) { + pr_err("field string is too long: %s%c%s%c\n", arg_pair->lhs, + arg_pair->operator, arg_pair->rhs, + arg_pair->separator); + return -E2BIG; + } + +- return ret; ++ return 0; + } + + /** +@@ -368,15 +364,13 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + */ + int dynevent_str_add(struct dynevent_cmd *cmd, const char *str) + { +- int ret = 0; +- +- ret = seq_buf_puts(&cmd->seq, str); +- if (ret) { ++ prt_str(&cmd->seq, str); ++ if (printbuf_overflowed(&cmd->seq)) { + pr_err("String is too long: %s\n", str); + return -E2BIG; + } + +- return ret; ++ return 0; + } + + /** +@@ -405,7 +399,7 @@ void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, + { + memset(cmd, '\0', sizeof(*cmd)); + +- seq_buf_init(&cmd->seq, buf, maxlen); ++ cmd->seq = PRINTBUF_EXTERN(buf, maxlen); + cmd->type = type; + cmd->run_command = run_command; + } +diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c +index 4b1057ab9d96..9d5137df1a15 100644 +--- a/kernel/trace/trace_events_filter.c ++++ b/kernel/trace/trace_events_filter.c +@@ -1059,7 +1059,7 @@ static void append_filter_err(struct trace_array *tr, + FILT_ERR_ERRNO, 0); + } + trace_seq_putc(s, 0); +- buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); ++ buf = kstrdup(printbuf_str(&s->seq), GFP_KERNEL); + if (buf) { + kfree(filter->filter_string); + filter->filter_string = buf; +diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c +index 5e8c07aef071..ddb2a2737b82 100644 +--- a/kernel/trace/trace_events_synth.c ++++ b/kernel/trace/trace_events_synth.c +@@ -5,13 +5,14 @@ + * Copyright (C) 2015, 2020 Tom Zanussi + */ + +-#include + #include +-#include ++#include + #include ++#include ++#include ++#include + #include + #include +-#include + #include + + /* for gfp flag names */ +@@ -611,7 +612,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + struct synth_field *field; + int len, ret = -ENOMEM; +- struct seq_buf s; ++ struct printbuf s; + ssize_t 
size; + + if (!strcmp(field_type, "unsigned")) { +@@ -666,17 +667,15 @@ static struct synth_field *parse_synth_field(int argc, char **argv, + if (!field->type) + goto free; + +- seq_buf_init(&s, field->type, len); ++ s = PRINTBUF_EXTERN(field->type, len); + if (prefix) +- seq_buf_puts(&s, prefix); +- seq_buf_puts(&s, field_type); ++ prt_str(&s, prefix); ++ prt_str(&s, field_type); + if (array) +- seq_buf_puts(&s, array); +- if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) ++ prt_str(&s, array); ++ if (WARN_ON_ONCE(!printbuf_remaining(&s))) + goto free; + +- s.buffer[s.len] = '\0'; +- + size = synth_field_size(field->type); + if (size < 0) { + if (array) +@@ -694,13 +693,12 @@ static struct synth_field *parse_synth_field(int argc, char **argv, + if (!type) + goto free; + +- seq_buf_init(&s, type, len); +- seq_buf_puts(&s, "__data_loc "); +- seq_buf_puts(&s, field->type); ++ s = PRINTBUF_EXTERN(type, len); ++ prt_str(&s, "__data_loc "); ++ prt_str(&s, field->type); + +- if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) ++ if (WARN_ON_ONCE(!printbuf_remaining(&s))) + goto free; +- s.buffer[s.len] = '\0'; + + kfree(field->type); + field->type = type; +@@ -1514,7 +1512,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd) + struct synth_event *se; + int ret; + +- ret = create_or_delete_synth_event(cmd->seq.buffer); ++ ret = create_or_delete_synth_event(cmd->seq.buf); + if (ret) + return ret; + +diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c +index 203204cadf92..9f270fdde99b 100644 +--- a/kernel/trace/trace_functions_graph.c ++++ b/kernel/trace/trace_functions_graph.c +@@ -1022,9 +1022,9 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + goto out; + + /* Strip ending newline */ +- if (s->buffer[s->seq.len - 1] == '\n') { +- s->buffer[s->seq.len - 1] = '\0'; +- s->seq.len--; ++ if (s->buffer[s->seq.pos - 1] == '\n') { ++ s->buffer[s->seq.pos - 1] = '\0'; ++ s->seq.pos--; + } + + trace_seq_puts(s, " */\n"); +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index a245ea673715..c9f03c2d7c91 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -915,7 +915,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) + + static int trace_kprobe_run_command(struct dynevent_cmd *cmd) + { +- return create_or_delete_trace_kprobe(cmd->seq.buffer); ++ return create_or_delete_trace_kprobe(printbuf_str(&cmd->seq)); + } + + /** +diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c +index 9c90b3a7dce2..48c08f29c342 100644 +--- a/kernel/trace/trace_seq.c ++++ b/kernel/trace/trace_seq.c +@@ -25,11 +25,9 @@ + */ + #include + #include ++#include + #include + +-/* How much buffer is left on the trace_seq? */ +-#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) +- + /* + * trace_seq should work with being initialized with 0s. + */ +@@ -54,7 +52,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) + + __trace_seq_init(s); + +- ret = seq_buf_print_seq(m, &s->seq); ++ ret = seq_write(m, s->seq.buf, printbuf_written(&s->seq)); + + /* + * Only reset this buffer if we successfully wrote to the +@@ -80,7 +78,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) + */ + void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + va_list ap; + + if (s->full) +@@ -89,12 +87,12 @@ void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+ __trace_seq_init(s); + + va_start(ap, fmt); +- seq_buf_vprintf(&s->seq, fmt, ap); ++ prt_vprintf(&s->seq, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + } + } +@@ -111,17 +109,17 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); + void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); ++ prt_printf(&s->seq, "%*pb", nmaskbits, maskp); + +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + } + } +@@ -140,18 +138,18 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); + */ + void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- seq_buf_vprintf(&s->seq, fmt, args); ++ prt_vprintf(&s->seq, fmt, args); + + /* If we can't write it all, don't bother writing anything */ +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + } + } +@@ -174,18 +172,18 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf); + */ + void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- seq_buf_bprintf(&s->seq, fmt, binary); ++ prt_bstrprintf(&s->seq, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(!printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return; + } +@@ -211,12 +209,12 @@ void trace_seq_puts(struct trace_seq *s, const char *str) + + __trace_seq_init(s); + +- if (len > TRACE_SEQ_BUF_LEFT(s)) { ++ if (len > printbuf_remaining(&s->seq)) { + s->full = 1; + return; + } + +- seq_buf_putmem(&s->seq, str, len); ++ prt_bytes(&s->seq, str, len); + } + EXPORT_SYMBOL_GPL(trace_seq_puts); + +@@ -237,12 +235,12 @@ void trace_seq_putc(struct trace_seq *s, unsigned char c) + + __trace_seq_init(s); + +- if (TRACE_SEQ_BUF_LEFT(s) < 1) { ++ if (!printbuf_remaining(&s->seq)) { + s->full = 1; + return; + } + +- seq_buf_putc(&s->seq, c); ++ prt_char(&s->seq, c); + } + EXPORT_SYMBOL_GPL(trace_seq_putc); + +@@ -263,12 +261,12 @@ void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) + + __trace_seq_init(s); + +- if (len > TRACE_SEQ_BUF_LEFT(s)) { ++ if (len > printbuf_remaining(&s->seq)) { + s->full = 1; + return; + } + +- seq_buf_putmem(&s->seq, mem, len); ++ prt_bytes(&s->seq, mem, len); + } + EXPORT_SYMBOL_GPL(trace_seq_putmem); + +@@ -285,24 +283,17 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem); + void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- /* Each byte is represented by two chars */ +- if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { +- s->full = 1; +- 
return; +- } ++ prt_hex_bytes(&s->seq, mem, len, 8, ' '); + +- /* The added spaces can still cause an overflow */ +- seq_buf_putmem_hex(&s->seq, mem, len); +- +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return; + } +@@ -323,22 +314,22 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + */ + int trace_seq_path(struct trace_seq *s, const struct path *path) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return 0; + + __trace_seq_init(s); + +- if (TRACE_SEQ_BUF_LEFT(s) < 1) { ++ if (printbuf_remaining(&s->seq) < 1) { + s->full = 1; + return 0; + } + +- seq_buf_path(&s->seq, path, "\n"); ++ prt_path(&s->seq, path, "\n"); + +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return 0; + } +@@ -369,8 +360,25 @@ EXPORT_SYMBOL_GPL(trace_seq_path); + */ + int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) + { ++ int ret, len; ++ + __trace_seq_init(s); +- return seq_buf_to_user(&s->seq, ubuf, cnt); ++ ++ len = printbuf_written(&s->seq); ++ if (len <= s->readpos) ++ return -EBUSY; ++ ++ len -= s->readpos; ++ if (cnt > len) ++ cnt = len; ++ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); ++ if (ret == cnt) ++ return -EFAULT; ++ ++ cnt -= ret; ++ ++ s->readpos += cnt; ++ return cnt; + } + EXPORT_SYMBOL_GPL(trace_seq_to_user); + +@@ -378,24 +386,19 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return 0; + + __trace_seq_init(s); + +- if (TRACE_SEQ_BUF_LEFT(s) < 1) { +- s->full = 1; +- return 0; +- } +- +- seq_buf_hex_dump(&(s->seq), prefix_str, +- prefix_type, rowsize, groupsize, +- buf, len, ascii); ++ prt_hex_dump(&s->seq, buf, len, ++ prefix_str, prefix_type, ++ rowsize, groupsize, ascii); + +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return 0; + } +diff --git a/lib/Kconfig b/lib/Kconfig +index eaaad4d85bf2..8eb7050fb422 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -491,6 +491,9 @@ config ASSOCIATIVE_ARRAY + + for more information. + ++config CLOSURES ++ bool ++ + config HAS_IOMEM + bool + depends on !NO_IOMEM +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 2e24db4bff19..1d4ed12a5355 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1646,6 +1646,15 @@ config DEBUG_CREDENTIALS + + source "kernel/rcu/Kconfig.debug" + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. 
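As a hedged illustration of what this option makes visible (the function and variable names below are invented for the sketch, and it assumes closure_init() registers the closure for debugging the way the bcache closure code does):

	/* illustrative sketch only, not an addition to the patch */
	static struct closure example_cl;

	static void example_start_async_op(void)
	{
		closure_init(&example_cl, NULL);	/* take the initial reference */
		/*
		 * Until a matching closure_put(&example_cl) drops that reference,
		 * the closure stays on the global closure_list and is listed by
		 * the "closures" debugfs file created further down in this patch.
		 */
	}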
++ + config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL +diff --git a/lib/Makefile b/lib/Makefile +index f99bf61f8bbc..d24209a59df9 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -30,11 +30,11 @@ endif + lib-y := ctype.o string.o vsprintf.o cmdline.o \ + rbtree.o radix-tree.o timerqueue.o xarray.o \ + idr.o extable.o sha1.o irq_regs.o argv_split.o \ +- flex_proportions.o ratelimit.o show_mem.o \ ++ flex_proportions.o ratelimit.o \ + is_single_threaded.o plist.o decompress.o kobject_uevent.o \ +- earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ ++ earlycpio.o siphash.o dec_and_lock.o \ + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \ +- buildid.o ++ buildid.o printbuf.o pretty-printers.o + + lib-$(CONFIG_PRINTK) += dump_stack.o + lib-$(CONFIG_SMP) += cpumask.o +@@ -241,6 +241,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + + obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o + ++obj-$(CONFIG_CLOSURES) += closure.o ++ + obj-$(CONFIG_DQL) += dynamic_queue_limits.o + + obj-$(CONFIG_GLOB) += glob.o +diff --git a/drivers/md/bcache/closure.c b/lib/closure.c +similarity index 88% +rename from drivers/md/bcache/closure.c +rename to lib/closure.c +index d8d9394a6beb..b38ded00b9b0 100644 +--- a/drivers/md/bcache/closure.c ++++ b/lib/closure.c +@@ -6,13 +6,12 @@ + * Copyright 2012 Google, Inc. + */ + ++#include + #include +-#include ++#include + #include + #include + +-#include "closure.h" +- + static inline void closure_put_after_sub(struct closure *cl, int flags) + { + int r = flags & CLOSURE_REMAINING_MASK; +@@ -45,6 +44,7 @@ void closure_sub(struct closure *cl, int v) + { + closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); + } ++EXPORT_SYMBOL(closure_sub); + + /* + * closure_put - decrement a closure's refcount +@@ -53,6 +53,7 @@ void closure_put(struct closure *cl) + { + closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); + } ++EXPORT_SYMBOL(closure_put); + + /* + * closure_wake_up - wake up all closures on a wait list, without memory barrier +@@ -74,6 +75,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) + closure_sub(cl, CLOSURE_WAITING + 1); + } + } ++EXPORT_SYMBOL(__closure_wake_up); + + /** + * closure_wait - add a closure to a waitlist +@@ -93,6 +95,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) + + return true; + } ++EXPORT_SYMBOL(closure_wait); + + struct closure_syncer { + struct task_struct *task; +@@ -127,8 +130,9 @@ void __sched __closure_sync(struct closure *cl) + + __set_current_state(TASK_RUNNING); + } ++EXPORT_SYMBOL(__closure_sync); + +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + + static LIST_HEAD(closure_list); + static DEFINE_SPINLOCK(closure_list_lock); +@@ -144,6 +148,7 @@ void closure_debug_create(struct closure *cl) + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); + } ++EXPORT_SYMBOL(closure_debug_create); + + void closure_debug_destroy(struct closure *cl) + { +@@ -156,8 +161,7 @@ void closure_debug_destroy(struct closure *cl) + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); + } +- +-static struct dentry *closure_debug; ++EXPORT_SYMBOL(closure_debug_destroy); + + static int debug_show(struct seq_file *f, void *data) + { +@@ -181,7 +185,7 @@ static int debug_show(struct seq_file *f, void *data) + seq_printf(f, " W %pS\n", + (void *) cl->waiting_on); + +- seq_printf(f, "\n"); ++ seq_puts(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); 
+@@ -190,18 +194,11 @@ static int debug_show(struct seq_file *f, void *data) + + DEFINE_SHOW_ATTRIBUTE(debug); + +-void __init closure_debug_init(void) ++static int __init closure_debug_init(void) + { +- if (!IS_ERR_OR_NULL(bcache_debug)) +- /* +- * it is unnecessary to check return value of +- * debugfs_create_file(), we should not care +- * about this. +- */ +- closure_debug = debugfs_create_file( +- "closures", 0400, bcache_debug, NULL, &debug_fops); ++ debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); ++ return 0; + } +-#endif ++late_initcall(closure_debug_init) + +-MODULE_AUTHOR("Kent Overstreet "); +-MODULE_LICENSE("GPL"); ++#endif +diff --git a/lib/errname.c b/lib/errname.c +index 05cbf731545f..82ea4778f478 100644 +--- a/lib/errname.c ++++ b/lib/errname.c +@@ -222,3 +222,4 @@ const char *errname(int err) + + return err > 0 ? name + 1 : name; + } ++EXPORT_SYMBOL(errname); +diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c +index f25eb111c051..41f1bcdc4488 100644 +--- a/lib/generic-radix-tree.c ++++ b/lib/generic-radix-tree.c +@@ -1,4 +1,5 @@ + ++#include + #include + #include + #include +@@ -166,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; ++ ++ if (iter->offset == SIZE_MAX) ++ return NULL; ++ + restart: + r = READ_ONCE(radix->root); + if (!r) +@@ -184,10 +189,17 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + (GENRADIX_ARY - 1); + + while (!n->children[i]) { ++ size_t objs_per_ptr = genradix_depth_size(level); ++ ++ if (iter->offset + objs_per_ptr < iter->offset) { ++ iter->offset = SIZE_MAX; ++ iter->pos = SIZE_MAX; ++ return NULL; ++ } ++ + i++; +- iter->offset = round_down(iter->offset + +- genradix_depth_size(level), +- genradix_depth_size(level)); ++ iter->offset = round_down(iter->offset + objs_per_ptr, ++ objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * + objs_per_page; + if (i == GENRADIX_ARY) +@@ -201,6 +213,64 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + } + EXPORT_SYMBOL(__genradix_iter_peek); + ++void *__genradix_iter_peek_prev(struct genradix_iter *iter, ++ struct __genradix *radix, ++ size_t objs_per_page, ++ size_t obj_size_plus_page_remainder) ++{ ++ struct genradix_root *r; ++ struct genradix_node *n; ++ unsigned level, i; ++ ++ if (iter->offset == SIZE_MAX) ++ return NULL; ++ ++restart: ++ r = READ_ONCE(radix->root); ++ if (!r) ++ return NULL; ++ ++ n = genradix_root_to_node(r); ++ level = genradix_root_to_depth(r); ++ ++ if (ilog2(iter->offset) >= genradix_depth_shift(level)) { ++ iter->offset = genradix_depth_size(level); ++ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; ++ ++ iter->offset -= obj_size_plus_page_remainder; ++ iter->pos--; ++ } ++ ++ while (level) { ++ level--; ++ ++ i = (iter->offset >> genradix_depth_shift(level)) & ++ (GENRADIX_ARY - 1); ++ ++ while (!n->children[i]) { ++ size_t objs_per_ptr = genradix_depth_size(level); ++ ++ iter->offset = round_down(iter->offset, objs_per_ptr); ++ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; ++ ++ if (!iter->offset) ++ return NULL; ++ ++ iter->offset -= obj_size_plus_page_remainder; ++ iter->pos--; ++ ++ if (!i) ++ goto restart; ++ --i; ++ } ++ ++ n = n->children[i]; ++ } ++ ++ return &n->data[iter->offset & (PAGE_SIZE - 1)]; ++} ++EXPORT_SYMBOL(__genradix_iter_peek_prev); ++ + static void genradix_free_recurse(struct genradix_node *n, unsigned level) + { + if (level) { +diff --git a/lib/hexdump.c b/lib/hexdump.c +index 
06833d404398..9556f15ad295 100644 +--- a/lib/hexdump.c ++++ b/lib/hexdump.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + + const char hex_asc[] = "0123456789abcdef"; +@@ -79,32 +80,40 @@ int hex2bin(u8 *dst, const char *src, size_t count) + EXPORT_SYMBOL(hex2bin); + + /** +- * bin2hex - convert binary data to an ascii hexadecimal string +- * @dst: ascii hexadecimal result +- * @src: binary data +- * @count: binary data length ++ * prt_hex_bytes - Print a string of hex bytes, with optional separator ++ * ++ * @out: The printbuf to output to ++ * @addr: Buffer to print ++ * @nr: Number of bytes to print ++ * @separator: Optional separator character between each byte + */ +-char *bin2hex(char *dst, const void *src, size_t count) ++void prt_hex_bytes(struct printbuf *out, const void *buf, unsigned len, ++ unsigned groupsize, unsigned separator) + { +- const unsigned char *_src = src; ++ const u8 *ptr = buf; ++ unsigned i; + +- while (count--) +- dst = hex_byte_pack(dst, *_src++); +- return dst; ++ if (!groupsize) ++ groupsize = 1; ++ ++ for (i = 0; i < len ; ++i) { ++ if (i && separator && !(i % groupsize)) ++ __prt_char(out, separator); ++ prt_hex_byte(out, ptr[i]); ++ } + } +-EXPORT_SYMBOL(bin2hex); ++EXPORT_SYMBOL(prt_hex_bytes); + + /** +- * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory ++ * prt_hex_line - convert a blob of data to "hex ASCII" in memory ++ * @out: printbuf to output to + * @buf: data blob to dump + * @len: number of bytes in the @buf + * @rowsize: number of bytes to print per line; must be 16 or 32 + * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) +- * @linebuf: where to put the converted data +- * @linebuflen: total size of @linebuf, including space for terminating NUL + * @ascii: include ASCII after the hex output + * +- * hex_dump_to_buffer() works on one "line" of output at a time, i.e., ++ * prt_hex_line() works on one "line" of output at a time, i.e., + * 16 or 32 bytes of input data converted to hex + ASCII output. + * + * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data +@@ -117,22 +126,13 @@ EXPORT_SYMBOL(bin2hex); + * + * example output buffer: + * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO +- * +- * Return: +- * The amount of bytes placed in the buffer without terminating NUL. If the +- * output was truncated, then the return value is the number of bytes +- * (excluding the terminating NUL) which would have been written to the final +- * string if enough space had been available. + */ +-int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, +- char *linebuf, size_t linebuflen, bool ascii) ++void prt_hex_line(struct printbuf *out, const void *buf, size_t len, ++ int rowsize, int groupsize, bool ascii) + { ++ unsigned saved_pos = out->pos; + const u8 *ptr = buf; +- int ngroups; +- u8 ch; +- int j, lx = 0; +- int ascii_column; +- int ret; ++ int i, ngroups; + + if (rowsize != 16 && rowsize != 32) + rowsize = 16; +@@ -145,84 +145,127 @@ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, + groupsize = 1; + + ngroups = len / groupsize; +- ascii_column = rowsize * 2 + rowsize / groupsize + 1; +- +- if (!linebuflen) +- goto overflow1; + + if (!len) +- goto nil; +- +- if (groupsize == 8) { +- const u64 *ptr8 = buf; +- +- for (j = 0; j < ngroups; j++) { +- ret = snprintf(linebuf + lx, linebuflen - lx, +- "%s%16.16llx", j ? 
" " : "", +- get_unaligned(ptr8 + j)); +- if (ret >= linebuflen - lx) +- goto overflow1; +- lx += ret; +- } +- } else if (groupsize == 4) { +- const u32 *ptr4 = buf; +- +- for (j = 0; j < ngroups; j++) { +- ret = snprintf(linebuf + lx, linebuflen - lx, +- "%s%8.8x", j ? " " : "", +- get_unaligned(ptr4 + j)); +- if (ret >= linebuflen - lx) +- goto overflow1; +- lx += ret; +- } +- } else if (groupsize == 2) { +- const u16 *ptr2 = buf; +- +- for (j = 0; j < ngroups; j++) { +- ret = snprintf(linebuf + lx, linebuflen - lx, +- "%s%4.4x", j ? " " : "", +- get_unaligned(ptr2 + j)); +- if (ret >= linebuflen - lx) +- goto overflow1; +- lx += ret; +- } +- } else { +- for (j = 0; j < len; j++) { +- if (linebuflen < lx + 2) +- goto overflow2; +- ch = ptr[j]; +- linebuf[lx++] = hex_asc_hi(ch); +- if (linebuflen < lx + 2) +- goto overflow2; +- linebuf[lx++] = hex_asc_lo(ch); +- if (linebuflen < lx + 2) +- goto overflow2; +- linebuf[lx++] = ' '; ++ return; ++ ++ prt_hex_bytes(out, ptr, len, groupsize, ' '); ++ ++ if (ascii) { ++ unsigned ascii_column = rowsize * 2 + rowsize / groupsize + 1; ++ ++ prt_chars(out, ' ', max_t(int, 0, ascii_column - (out->pos - saved_pos))); ++ ++ for (i = 0; i < len; i++) { ++ u8 ch = ptr[i]; ++ prt_char(out, isascii(ch) && isprint(ch) ? ch : '.'); + } +- if (j) +- lx--; + } +- if (!ascii) +- goto nil; ++} ++EXPORT_SYMBOL(prt_hex_line); + +- while (lx < ascii_column) { +- if (linebuflen < lx + 2) +- goto overflow2; +- linebuf[lx++] = ' '; +- } +- for (j = 0; j < len; j++) { +- if (linebuflen < lx + 2) +- goto overflow2; +- ch = ptr[j]; +- linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; ++/** ++ * prt_hex_dump - print multiline formatted hex dump ++ * @out: printbuf to output to ++ * @buf: data blob to dump ++ * @len: number of bytes in the @buf ++ * @prefix_str: string to prefix each line with; ++ * caller supplies trailing spaces for alignment if desired ++ * @prefix_type: controls whether prefix of an offset, address, or none ++ * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) ++ * @rowsize: number of bytes to print per line; must be 16 or 32 ++ * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) ++ * @ascii: include ASCII after the hex output ++ * ++ * Function is an analogue of print_hex_dump() and thus has similar interface. ++ * ++ * linebuf size is maximal length for one line. ++ * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for ++ * separating space ++ * 2 - spaces separating hex dump and ascii representation ++ * 32 - ascii representation ++ * 1 - terminating '\0' ++ */ ++void prt_hex_dump(struct printbuf *out, const void *buf, size_t len, ++ const char *prefix_str, int prefix_type, ++ unsigned rowsize, unsigned groupsize, bool ascii) ++{ ++ const u8 *ptr = buf; ++ size_t i; ++ ++ if (rowsize != 16 && rowsize != 32) ++ rowsize = 16; ++ ++ for (i = 0; i < len; i += rowsize) { ++ prt_str(out, prefix_str); ++ ++ switch (prefix_type) { ++ case DUMP_PREFIX_ADDRESS: ++ prt_printf(out, "%p: ", ptr + i); ++ break; ++ case DUMP_PREFIX_OFFSET: ++ prt_printf(out, "%.8zx: ", i); ++ break; ++ } ++ ++ prt_hex_line(out, ptr + i, min_t(size_t, len - i, rowsize), ++ rowsize, groupsize, ascii); ++ prt_char(out, '\n'); + } +-nil: +- linebuf[lx] = '\0'; +- return lx; +-overflow2: +- linebuf[lx++] = '\0'; +-overflow1: +- return ascii ? 
ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; ++} ++ ++/** ++ * bin2hex - convert binary data to an ascii hexadecimal string ++ * @dst: ascii hexadecimal result ++ * @src: binary data ++ * @count: binary data length ++ */ ++char *bin2hex(char *dst, const void *src, size_t count) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(dst, count * 4); ++ ++ prt_hex_bytes(&out, src, count, 0, 0); ++ return dst + out.pos; ++} ++EXPORT_SYMBOL(bin2hex); ++ ++/** ++ * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory ++ * @buf: data blob to dump ++ * @len: number of bytes in the @buf ++ * @rowsize: number of bytes to print per line; must be 16 or 32 ++ * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) ++ * @linebuf: where to put the converted data ++ * @linebuflen: total size of @linebuf, including space for terminating NUL ++ * @ascii: include ASCII after the hex output ++ * ++ * hex_dump_to_buffer() works on one "line" of output at a time, i.e., ++ * 16 or 32 bytes of input data converted to hex + ASCII output. ++ * ++ * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data ++ * to a hex + ASCII dump at the supplied memory location. ++ * The converted output is always NUL-terminated. ++ * ++ * E.g.: ++ * hex_dump_to_buffer(frame->data, frame->len, 16, 1, ++ * linebuf, sizeof(linebuf), true); ++ * ++ * example output buffer: ++ * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO ++ * ++ * Return: ++ * The amount of bytes placed in the buffer without terminating NUL. If the ++ * output was truncated, then the return value is the number of bytes ++ * (excluding the terminating NUL) which would have been written to the final ++ * string if enough space had been available. ++ */ ++int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, ++ char *linebuf, size_t linebuflen, bool ascii) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(linebuf, linebuflen); ++ ++ prt_hex_line(&out, buf, len, rowsize, groupsize, ascii); ++ return out.pos; + } + EXPORT_SYMBOL(hex_dump_to_buffer); + +@@ -262,6 +305,11 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, + int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) + { ++ /* ++ * XXX: this code does the exact same thing as prt_hex_dump(): we should ++ * be able to call that and printk() the result, except printk is ++ * restricted to 1024 bytes of output per call ++ */ + const u8 *ptr = buf; + int i, linelen, remaining = len; + unsigned char linebuf[32 * 3 + 2 + 32 + 1]; +diff --git a/lib/pretty-printers.c b/lib/pretty-printers.c +new file mode 100644 +index 000000000000..addbac95e065 +--- /dev/null ++++ b/lib/pretty-printers.c +@@ -0,0 +1,60 @@ ++// SPDX-License-Identifier: LGPL-2.1+ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#include ++#include ++#include ++#include ++ ++/** ++ * prt_string_option - Given a list of strings, print out the list and indicate ++ * which option is selected, with square brackets (sysfs style) ++ * ++ * @out: The printbuf to output to ++ * @list: List of strings to choose from ++ * @selected: The option to highlight, with square brackets ++ */ ++void prt_string_option(struct printbuf *out, ++ const char * const list[], ++ size_t selected) ++{ ++ size_t i; ++ ++ for (i = 0; list[i]; i++) { ++ if (i) ++ prt_char(out, ' '); ++ if (i == selected) ++ prt_char(out, '['); ++ prt_str(out, list[i]); ++ if (i == selected) ++ prt_char(out, ']'); ++ } ++} ++EXPORT_SYMBOL(prt_string_option); ++ ++/** 
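/*
 * Illustrative aside, not part of the patch: prt_string_option() above pairs
 * naturally with PRINTBUF_EXTERN() (used elsewhere in this patch) for
 * sysfs-style "pick one" output.  A minimal sketch under stated assumptions:
 * the header paths, the mode names and demo_show_mode() are all made up for
 * the example.
 */
#include <linux/printbuf.h>		/* assumed header for the printbuf API */
#include <linux/pretty-printers.h>	/* assumed header for prt_string_option() */

static ssize_t demo_show_mode(char *page, size_t selected)
{
	static const char * const modes[] = {
		"writethrough", "writeback", "none", NULL	/* list must be NULL terminated */
	};
	struct printbuf out = PRINTBUF_EXTERN(page, PAGE_SIZE);

	/* prints e.g. "writethrough [writeback] none" */
	prt_string_option(&out, modes, selected);
	prt_char(&out, '\n');

	return out.pos;		/* number of bytes written */
}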
++ * prt_bitflags: Given a bitmap and a list of names for each bit, print out which ++ * bits are on, comma separated ++ * ++ * @out: The printbuf to output to ++ * @list: List of names for each bit ++ * @flags: Bits to print ++ */ ++void prt_bitflags(struct printbuf *out, ++ const char * const list[], u64 flags) ++{ ++ unsigned bit, nr = 0; ++ bool first = true; ++ ++ while (list[nr]) ++ nr++; ++ ++ while (flags && (bit = __ffs(flags)) < nr) { ++ if (!first) ++ prt_char(out, ','); ++ first = false; ++ prt_str(out, list[bit]); ++ flags ^= 1 << bit; ++ } ++} ++EXPORT_SYMBOL(prt_bitflags); +diff --git a/lib/printbuf.c b/lib/printbuf.c +new file mode 100644 +index 000000000000..047470025748 +--- /dev/null ++++ b/lib/printbuf.c +@@ -0,0 +1,258 @@ ++// SPDX-License-Identifier: LGPL-2.1+ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#ifdef __KERNEL__ ++#include ++#include ++#else ++#define EXPORT_SYMBOL(x) ++#endif ++ ++#include ++#include ++#include ++#include ++ ++static inline size_t printbuf_linelen(struct printbuf *buf) ++{ ++ return buf->pos - buf->last_newline; ++} ++ ++int printbuf_make_room(struct printbuf *out, unsigned extra) ++{ ++ unsigned new_size; ++ char *buf; ++ ++ if (!out->heap_allocated) ++ return 0; ++ ++ /* Reserved space for terminating nul: */ ++ extra += 1; ++ ++ if (out->pos + extra < out->size) ++ return 0; ++ ++ new_size = roundup_pow_of_two(out->size + extra); ++ ++ /* ++ * Note: output buffer must be freeable with kfree(), it's not required ++ * that the user use printbuf_exit(). ++ */ ++ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); ++ ++ if (!buf) { ++ out->allocation_failure = true; ++ return -ENOMEM; ++ } ++ ++ out->buf = buf; ++ out->size = new_size; ++ return 0; ++} ++EXPORT_SYMBOL(printbuf_make_room); ++ ++/** ++ * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null ++ * terminated ++ */ ++const char *printbuf_str(const struct printbuf *buf) ++{ ++ /* ++ * If we've written to a printbuf then it's guaranteed to be a null ++ * terminated string - but if we haven't, then we might not have ++ * allocated a buffer at all: ++ */ ++ return buf->pos ++ ? buf->buf ++ : ""; ++} ++EXPORT_SYMBOL(printbuf_str); ++ ++/** ++ * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it ++ * against accidental use. ++ */ ++void printbuf_exit(struct printbuf *buf) ++{ ++ if (buf->heap_allocated) { ++ kfree(buf->buf); ++ buf->buf = ERR_PTR(-EINTR); /* poison value */ ++ } ++} ++EXPORT_SYMBOL(printbuf_exit); ++ ++void prt_newline(struct printbuf *buf) ++{ ++ unsigned i; ++ ++ printbuf_make_room(buf, 1 + buf->indent); ++ ++ __prt_char(buf, '\n'); ++ ++ buf->last_newline = buf->pos; ++ ++ for (i = 0; i < buf->indent; i++) ++ __prt_char(buf, ' '); ++ ++ printbuf_nul_terminate(buf); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop = 0; ++} ++EXPORT_SYMBOL(prt_newline); ++ ++/** ++ * printbuf_indent_add - add to the current indent level ++ * ++ * @buf: printbuf to control ++ * @spaces: number of spaces to add to the current indent level ++ * ++ * Subsequent lines, and the current line if the output position is at the start ++ * of the current line, will be indented by @spaces more spaces. 
++ */ ++void printbuf_indent_add(struct printbuf *buf, unsigned spaces) ++{ ++ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) ++ spaces = 0; ++ ++ buf->indent += spaces; ++ while (spaces--) ++ prt_char(buf, ' '); ++} ++EXPORT_SYMBOL(printbuf_indent_add); ++ ++/** ++ * printbuf_indent_sub - subtract from the current indent level ++ * ++ * @buf: printbuf to control ++ * @spaces: number of spaces to subtract from the current indent level ++ * ++ * Subsequent lines, and the current line if the output position is at the start ++ * of the current line, will be indented by @spaces less spaces. ++ */ ++void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) ++{ ++ if (WARN_ON_ONCE(spaces > buf->indent)) ++ spaces = buf->indent; ++ ++ if (buf->last_newline + buf->indent == buf->pos) { ++ buf->pos -= spaces; ++ printbuf_nul_terminate(buf); ++ } ++ buf->indent -= spaces; ++} ++EXPORT_SYMBOL(printbuf_indent_sub); ++ ++/** ++ * prt_tab - Advance printbuf to the next tabstop ++ * ++ * @buf: printbuf to control ++ * ++ * Advance output to the next tabstop by printing spaces. ++ */ ++void prt_tab(struct printbuf *out) ++{ ++ int spaces = max_t(int, 0, out->tabstops[out->tabstop] - printbuf_linelen(out)); ++ ++ BUG_ON(out->tabstop > ARRAY_SIZE(out->tabstops)); ++ ++ prt_chars(out, ' ', spaces); ++ ++ out->last_field = out->pos; ++ out->tabstop++; ++} ++EXPORT_SYMBOL(prt_tab); ++ ++/** ++ * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying ++ * previous output ++ * ++ * @buf: printbuf to control ++ * ++ * Advance output to the next tabstop by inserting spaces immediately after the ++ * previous tabstop, right justifying previously outputted text. ++ */ ++void prt_tab_rjust(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { ++ unsigned move = buf->pos - buf->last_field; ++ unsigned shift = buf->tabstops[buf->tabstop] - ++ printbuf_linelen(buf); ++ ++ printbuf_make_room(buf, shift); ++ ++ if (buf->last_field + shift < buf->size) ++ memmove(buf->buf + buf->last_field + shift, ++ buf->buf + buf->last_field, ++ min(move, buf->size - 1 - buf->last_field - shift)); ++ ++ if (buf->last_field < buf->size) ++ memset(buf->buf + buf->last_field, ' ', ++ min(shift, buf->size - buf->last_field)); ++ ++ buf->pos += shift; ++ printbuf_nul_terminate(buf); ++ } ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++EXPORT_SYMBOL(prt_tab_rjust); ++ ++/** ++ * prt_human_readable_u64 - Print out a u64 in human readable units ++ * ++ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units ++ */ ++void prt_human_readable_u64(struct printbuf *buf, u64 v) ++{ ++ printbuf_make_room(buf, 10); ++ buf->pos += string_get_size(v, 1, !buf->si_units, ++ buf->buf + buf->pos, ++ printbuf_remaining_size(buf)); ++} ++EXPORT_SYMBOL(prt_human_readable_u64); ++ ++/** ++ * prt_human_readable_s64 - Print out a s64 in human readable units ++ * ++ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units ++ */ ++void prt_human_readable_s64(struct printbuf *buf, s64 v) ++{ ++ if (v < 0) ++ prt_char(buf, '-'); ++ prt_human_readable_u64(buf, abs(v)); ++} ++EXPORT_SYMBOL(prt_human_readable_s64); ++ ++/** ++ * prt_units_u64 - Print out a u64 according to printbuf unit options ++ * ++ * Units are either raw (default), or human reabable units (controlled via ++ * @buf->human_readable_units) ++ */ ++void prt_units_u64(struct printbuf *out, u64 v) ++{ ++ if (out->human_readable_units) ++ 
prt_human_readable_u64(out, v); ++ else ++ prt_printf(out, "%llu", v); ++} ++EXPORT_SYMBOL(prt_units_u64); ++ ++/** ++ * prt_units_s64 - Print out a s64 according to printbuf unit options ++ * ++ * Units are either raw (default), or human reabable units (controlled via ++ * @buf->human_readable_units) ++ */ ++void prt_units_s64(struct printbuf *out, s64 v) ++{ ++ if (v < 0) ++ prt_char(out, '-'); ++ prt_units_u64(out, abs(v)); ++} ++EXPORT_SYMBOL(prt_units_s64); +diff --git a/lib/seq_buf.c b/lib/seq_buf.c +deleted file mode 100644 +index 0a68f7aa85d6..000000000000 +--- a/lib/seq_buf.c ++++ /dev/null +@@ -1,397 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * seq_buf.c +- * +- * Copyright (C) 2014 Red Hat Inc, Steven Rostedt +- * +- * The seq_buf is a handy tool that allows you to pass a descriptor around +- * to a buffer that other functions can write to. It is similar to the +- * seq_file functionality but has some differences. +- * +- * To use it, the seq_buf must be initialized with seq_buf_init(). +- * This will set up the counters within the descriptor. You can call +- * seq_buf_init() more than once to reset the seq_buf to start +- * from scratch. +- */ +-#include +-#include +-#include +- +-/** +- * seq_buf_can_fit - can the new data fit in the current buffer? +- * @s: the seq_buf descriptor +- * @len: The length to see if it can fit in the current buffer +- * +- * Returns true if there's enough unused space in the seq_buf buffer +- * to fit the amount of new data according to @len. +- */ +-static bool seq_buf_can_fit(struct seq_buf *s, size_t len) +-{ +- return s->len + len <= s->size; +-} +- +-/** +- * seq_buf_print_seq - move the contents of seq_buf into a seq_file +- * @m: the seq_file descriptor that is the destination +- * @s: the seq_buf descriptor that is the source. +- * +- * Returns zero on success, non zero otherwise +- */ +-int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s) +-{ +- unsigned int len = seq_buf_used(s); +- +- return seq_write(m, s->buffer, len); +-} +- +-/** +- * seq_buf_vprintf - sequence printing of information. +- * @s: seq_buf descriptor +- * @fmt: printf format string +- * @args: va_list of arguments from a printf() type function +- * +- * Writes a vnprintf() format into the sequencce buffer. +- * +- * Returns zero on success, -1 on overflow. +- */ +-int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) +-{ +- int len; +- +- WARN_ON(s->size == 0); +- +- if (s->len < s->size) { +- len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); +- if (s->len + len < s->size) { +- s->len += len; +- return 0; +- } +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-/** +- * seq_buf_printf - sequence printing of information +- * @s: seq_buf descriptor +- * @fmt: printf format string +- * +- * Writes a printf() format into the sequence buffer. +- * +- * Returns zero on success, -1 on overflow. +- */ +-int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) +-{ +- va_list ap; +- int ret; +- +- va_start(ap, fmt); +- ret = seq_buf_vprintf(s, fmt, ap); +- va_end(ap); +- +- return ret; +-} +-EXPORT_SYMBOL_GPL(seq_buf_printf); +- +-#ifdef CONFIG_BINARY_PRINTF +-/** +- * seq_buf_bprintf - Write the printf string from binary arguments +- * @s: seq_buf descriptor +- * @fmt: The format string for the @binary arguments +- * @binary: The binary arguments for @fmt. 
+- * +- * When recording in a fast path, a printf may be recorded with just +- * saving the format and the arguments as they were passed to the +- * function, instead of wasting cycles converting the arguments into +- * ASCII characters. Instead, the arguments are saved in a 32 bit +- * word array that is defined by the format string constraints. +- * +- * This function will take the format and the binary array and finish +- * the conversion into the ASCII string within the buffer. +- * +- * Returns zero on success, -1 on overflow. +- */ +-int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary) +-{ +- unsigned int len = seq_buf_buffer_left(s); +- int ret; +- +- WARN_ON(s->size == 0); +- +- if (s->len < s->size) { +- ret = bstr_printf(s->buffer + s->len, len, fmt, binary); +- if (s->len + ret < s->size) { +- s->len += ret; +- return 0; +- } +- } +- seq_buf_set_overflow(s); +- return -1; +-} +-#endif /* CONFIG_BINARY_PRINTF */ +- +-/** +- * seq_buf_puts - sequence printing of simple string +- * @s: seq_buf descriptor +- * @str: simple string to record +- * +- * Copy a simple string into the sequence buffer. +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_puts(struct seq_buf *s, const char *str) +-{ +- size_t len = strlen(str); +- +- WARN_ON(s->size == 0); +- +- /* Add 1 to len for the trailing null byte which must be there */ +- len += 1; +- +- if (seq_buf_can_fit(s, len)) { +- memcpy(s->buffer + s->len, str, len); +- /* Don't count the trailing null byte against the capacity */ +- s->len += len - 1; +- return 0; +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-/** +- * seq_buf_putc - sequence printing of simple character +- * @s: seq_buf descriptor +- * @c: simple character to record +- * +- * Copy a single character into the sequence buffer. +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_putc(struct seq_buf *s, unsigned char c) +-{ +- WARN_ON(s->size == 0); +- +- if (seq_buf_can_fit(s, 1)) { +- s->buffer[s->len++] = c; +- return 0; +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-/** +- * seq_buf_putmem - write raw data into the sequenc buffer +- * @s: seq_buf descriptor +- * @mem: The raw memory to copy into the buffer +- * @len: The length of the raw memory to copy (in bytes) +- * +- * There may be cases where raw memory needs to be written into the +- * buffer and a strcpy() would not work. Using this function allows +- * for such cases. +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) +-{ +- WARN_ON(s->size == 0); +- +- if (seq_buf_can_fit(s, len)) { +- memcpy(s->buffer + s->len, mem, len); +- s->len += len; +- return 0; +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-#define MAX_MEMHEX_BYTES 8U +-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) +- +-/** +- * seq_buf_putmem_hex - write raw memory into the buffer in ASCII hex +- * @s: seq_buf descriptor +- * @mem: The raw memory to write its hex ASCII representation of +- * @len: The length of the raw memory to copy (in bytes) +- * +- * This is similar to seq_buf_putmem() except instead of just copying the +- * raw memory into the buffer it writes its ASCII representation of it +- * in hex characters. 
+- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, +- unsigned int len) +-{ +- unsigned char hex[HEX_CHARS]; +- const unsigned char *data = mem; +- unsigned int start_len; +- int i, j; +- +- WARN_ON(s->size == 0); +- +- BUILD_BUG_ON(MAX_MEMHEX_BYTES * 2 >= HEX_CHARS); +- +- while (len) { +- start_len = min(len, MAX_MEMHEX_BYTES); +-#ifdef __BIG_ENDIAN +- for (i = 0, j = 0; i < start_len; i++) { +-#else +- for (i = start_len-1, j = 0; i >= 0; i--) { +-#endif +- hex[j++] = hex_asc_hi(data[i]); +- hex[j++] = hex_asc_lo(data[i]); +- } +- if (WARN_ON_ONCE(j == 0 || j/2 > len)) +- break; +- +- /* j increments twice per loop */ +- hex[j++] = ' '; +- +- seq_buf_putmem(s, hex, j); +- if (seq_buf_has_overflowed(s)) +- return -1; +- +- len -= start_len; +- data += start_len; +- } +- return 0; +-} +- +-/** +- * seq_buf_path - copy a path into the sequence buffer +- * @s: seq_buf descriptor +- * @path: path to write into the sequence buffer. +- * @esc: set of characters to escape in the output +- * +- * Write a path name into the sequence buffer. +- * +- * Returns the number of written bytes on success, -1 on overflow +- */ +-int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc) +-{ +- char *buf; +- size_t size = seq_buf_get_buf(s, &buf); +- int res = -1; +- +- WARN_ON(s->size == 0); +- +- if (size) { +- char *p = d_path(path, buf, size); +- if (!IS_ERR(p)) { +- char *end = mangle_path(buf, p, esc); +- if (end) +- res = end - buf; +- } +- } +- seq_buf_commit(s, res); +- +- return res; +-} +- +-/** +- * seq_buf_to_user - copy the sequence buffer to user space +- * @s: seq_buf descriptor +- * @ubuf: The userspace memory location to copy to +- * @cnt: The amount to copy +- * +- * Copies the sequence buffer into the userspace memory pointed to +- * by @ubuf. It starts from the last read position (@s->readpos) +- * and writes up to @cnt characters or till it reaches the end of +- * the content in the buffer (@s->len), which ever comes first. +- * +- * On success, it returns a positive number of the number of bytes +- * it copied. +- * +- * On failure it returns -EBUSY if all of the content in the +- * sequence has been already read, which includes nothing in the +- * sequence (@s->len == @s->readpos). +- * +- * Returns -EFAULT if the copy to userspace fails. +- */ +-int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) +-{ +- int len; +- int ret; +- +- if (!cnt) +- return 0; +- +- len = seq_buf_used(s); +- +- if (len <= s->readpos) +- return -EBUSY; +- +- len -= s->readpos; +- if (cnt > len) +- cnt = len; +- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); +- if (ret == cnt) +- return -EFAULT; +- +- cnt -= ret; +- +- s->readpos += cnt; +- return cnt; +-} +- +-/** +- * seq_buf_hex_dump - print formatted hex dump into the sequence buffer +- * @s: seq_buf descriptor +- * @prefix_str: string to prefix each line with; +- * caller supplies trailing spaces for alignment if desired +- * @prefix_type: controls whether prefix of an offset, address, or none +- * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) +- * @rowsize: number of bytes to print per line; must be 16 or 32 +- * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) +- * @buf: data blob to dump +- * @len: number of bytes in the @buf +- * @ascii: include ASCII after the hex output +- * +- * Function is an analogue of print_hex_dump() and thus has similar interface. 
+- * +- * linebuf size is maximal length for one line. +- * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for +- * separating space +- * 2 - spaces separating hex dump and ascii representation +- * 32 - ascii representation +- * 1 - terminating '\0' +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, +- int rowsize, int groupsize, +- const void *buf, size_t len, bool ascii) +-{ +- const u8 *ptr = buf; +- int i, linelen, remaining = len; +- unsigned char linebuf[32 * 3 + 2 + 32 + 1]; +- int ret; +- +- if (rowsize != 16 && rowsize != 32) +- rowsize = 16; +- +- for (i = 0; i < len; i += rowsize) { +- linelen = min(remaining, rowsize); +- remaining -= rowsize; +- +- hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, +- linebuf, sizeof(linebuf), ascii); +- +- switch (prefix_type) { +- case DUMP_PREFIX_ADDRESS: +- ret = seq_buf_printf(s, "%s%p: %s\n", +- prefix_str, ptr + i, linebuf); +- break; +- case DUMP_PREFIX_OFFSET: +- ret = seq_buf_printf(s, "%s%.8x: %s\n", +- prefix_str, i, linebuf); +- break; +- default: +- ret = seq_buf_printf(s, "%s%s\n", prefix_str, linebuf); +- break; +- } +- if (ret) +- return ret; +- } +- return 0; +-} +diff --git a/lib/string_helpers.c b/lib/string_helpers.c +index 5ed3beb066e6..d247bf945f16 100644 +--- a/lib/string_helpers.c ++++ b/lib/string_helpers.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -32,8 +33,8 @@ + * at least 9 bytes and will always be zero terminated. + * + */ +-void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, +- char *buf, int len) ++int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, ++ char *buf, int len) + { + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" +@@ -126,8 +127,7 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, + else + unit = units_str[units][i]; + +- snprintf(buf, len, "%u%s %s", (u32)size, +- tmp, unit); ++ return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit); + } + EXPORT_SYMBOL(string_get_size); + +@@ -301,19 +301,14 @@ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) + } + EXPORT_SYMBOL(string_unescape); + +-static bool escape_passthrough(unsigned char c, char **dst, char *end) ++static bool escape_passthrough(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- +- if (out < end) +- *out = c; +- *dst = out + 1; ++ prt_char(out, c); + return true; + } + +-static bool escape_space(unsigned char c, char **dst, char *end) ++static bool escape_space(struct printbuf *out, unsigned char c) + { +- char *out = *dst; + unsigned char to; + + switch (c) { +@@ -336,20 +331,13 @@ static bool escape_space(unsigned char c, char **dst, char *end) + return false; + } + +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = to; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, to); + return true; + } + +-static bool escape_special(unsigned char c, char **dst, char *end) ++static bool escape_special(struct printbuf *out, unsigned char c) + { +- char *out = *dst; + unsigned char to; + + switch (c) { +@@ -369,83 +357,43 @@ static bool escape_special(unsigned char c, char **dst, char *end) + return false; + } + +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = to; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, to); + return 
true; + } + +-static bool escape_null(unsigned char c, char **dst, char *end) ++static bool escape_null(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- + if (c) + return false; + +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = '0'; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, '0'); + return true; + } + +-static bool escape_octal(unsigned char c, char **dst, char *end) ++static bool escape_octal(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = ((c >> 6) & 0x07) + '0'; +- ++out; +- if (out < end) +- *out = ((c >> 3) & 0x07) + '0'; +- ++out; +- if (out < end) +- *out = ((c >> 0) & 0x07) + '0'; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, ((c >> 6) & 0x07) + '0'); ++ prt_char(out, ((c >> 3) & 0x07) + '0'); ++ prt_char(out, ((c >> 0) & 0x07) + '0'); + return true; + } + +-static bool escape_hex(unsigned char c, char **dst, char *end) ++static bool escape_hex(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = 'x'; +- ++out; +- if (out < end) +- *out = hex_asc_hi(c); +- ++out; +- if (out < end) +- *out = hex_asc_lo(c); +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, 'x'); ++ prt_hex_byte(out, c); + return true; + } + + /** +- * string_escape_mem - quote characters in the given memory buffer ++ * prt_escaped_string - quote characters in the given memory buffer ++ * @out: printbuf to output to (escaped) + * @src: source buffer (unescaped) + * @isz: source buffer size +- * @dst: destination buffer (escaped) +- * @osz: destination buffer size + * @flags: combination of the flags + * @only: NULL-terminated string containing characters used to limit + * the selected escape class. If characters are included in @only +@@ -510,18 +458,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end) + * or %ESCAPE_HEX, because they cover most of the other character classes. + * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to + * the above. +- * +- * Return: +- * The total size of the escaped output that would be generated for +- * the given input and flags. To check whether the output was +- * truncated, compare the return value to osz. There is room left in +- * dst for a '\0' terminator if and only if ret < osz. + */ +-int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, +- unsigned int flags, const char *only) ++void prt_escaped_string(struct printbuf *out, ++ const char *src, size_t isz, ++ unsigned int flags, const char *only) + { +- char *p = dst; +- char *end = p + osz; + bool is_dict = only && *only; + bool is_append = flags & ESCAPE_APPEND; + +@@ -549,41 +490,126 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, + * %ESCAPE_NA cases. 
+ */ + if (!(is_append || in_dict) && is_dict && +- escape_passthrough(c, &p, end)) ++ escape_passthrough(out, c)) + continue; + + if (!(is_append && in_dict) && isascii(c) && isprint(c) && +- flags & ESCAPE_NAP && escape_passthrough(c, &p, end)) ++ flags & ESCAPE_NAP && escape_passthrough(out, c)) + continue; + + if (!(is_append && in_dict) && isprint(c) && +- flags & ESCAPE_NP && escape_passthrough(c, &p, end)) ++ flags & ESCAPE_NP && escape_passthrough(out, c)) + continue; + + if (!(is_append && in_dict) && isascii(c) && +- flags & ESCAPE_NA && escape_passthrough(c, &p, end)) ++ flags & ESCAPE_NA && escape_passthrough(out, c)) + continue; + +- if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) ++ if (flags & ESCAPE_SPACE && escape_space(out, c)) + continue; + +- if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end)) ++ if (flags & ESCAPE_SPECIAL && escape_special(out, c)) + continue; + +- if (flags & ESCAPE_NULL && escape_null(c, &p, end)) ++ if (flags & ESCAPE_NULL && escape_null(out, c)) + continue; + + /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ +- if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end)) ++ if (flags & ESCAPE_OCTAL && escape_octal(out, c)) + continue; + +- if (flags & ESCAPE_HEX && escape_hex(c, &p, end)) ++ if (flags & ESCAPE_HEX && escape_hex(out, c)) + continue; + +- escape_passthrough(c, &p, end); ++ escape_passthrough(out, c); + } ++} ++EXPORT_SYMBOL(prt_escaped_string); ++ ++/** ++ * string_escape_mem - quote characters in the given memory buffer ++ * @src: source buffer (unescaped) ++ * @isz: source buffer size ++ * @dst: destination buffer (escaped) ++ * @osz: destination buffer size ++ * @flags: combination of the flags ++ * @only: NULL-terminated string containing characters used to limit ++ * the selected escape class. If characters are included in @only ++ * that would not normally be escaped by the classes selected ++ * in @flags, they will be copied to @dst unescaped. ++ * ++ * Description: ++ * The process of escaping byte buffer includes several parts. They are applied ++ * in the following sequence. ++ * ++ * 1. The character is not matched to the one from @only string and thus ++ * must go as-is to the output. ++ * 2. The character is matched to the printable and ASCII classes, if asked, ++ * and in case of match it passes through to the output. ++ * 3. The character is matched to the printable or ASCII class, if asked, ++ * and in case of match it passes through to the output. ++ * 4. The character is checked if it falls into the class given by @flags. ++ * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any ++ * character. Note that they actually can't go together, otherwise ++ * %ESCAPE_HEX will be ignored. ++ * ++ * Caller must provide valid source and destination pointers. Be aware that ++ * destination buffer will not be NULL-terminated, thus caller have to append ++ * it if needs. 
The supported flags are:: ++ * ++ * %ESCAPE_SPACE: (special white space, not space itself) ++ * '\f' - form feed ++ * '\n' - new line ++ * '\r' - carriage return ++ * '\t' - horizontal tab ++ * '\v' - vertical tab ++ * %ESCAPE_SPECIAL: ++ * '\"' - double quote ++ * '\\' - backslash ++ * '\a' - alert (BEL) ++ * '\e' - escape ++ * %ESCAPE_NULL: ++ * '\0' - null ++ * %ESCAPE_OCTAL: ++ * '\NNN' - byte with octal value NNN (3 digits) ++ * %ESCAPE_ANY: ++ * all previous together ++ * %ESCAPE_NP: ++ * escape only non-printable characters, checked by isprint() ++ * %ESCAPE_ANY_NP: ++ * all previous together ++ * %ESCAPE_HEX: ++ * '\xHH' - byte with hexadecimal value HH (2 digits) ++ * %ESCAPE_NA: ++ * escape only non-ascii characters, checked by isascii() ++ * %ESCAPE_NAP: ++ * escape only non-printable or non-ascii characters ++ * %ESCAPE_APPEND: ++ * append characters from @only to be escaped by the given classes ++ * ++ * %ESCAPE_APPEND would help to pass additional characters to the escaped, when ++ * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided. ++ * ++ * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the ++ * higher priority than the rest of the flags (%ESCAPE_NAP is the highest). ++ * It doesn't make much sense to use either of them without %ESCAPE_OCTAL ++ * or %ESCAPE_HEX, because they cover most of the other character classes. ++ * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to ++ * the above. ++ * ++ * Return: ++ * The total size of the escaped output that would be generated for ++ * the given input and flags. To check whether the output was ++ * truncated, compare the return value to osz. There is room left in ++ * dst for a '\0' terminator if and only if ret < osz. ++ */ ++int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, ++ unsigned int flags, const char *only) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(dst, osz); + +- return p - dst; ++ prt_escaped_string(&out, src, isz, flags, only); ++ return out.pos; + } + EXPORT_SYMBOL(string_escape_mem); + +diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c +index 5144899d3c6b..f9e97879dcdf 100644 +--- a/lib/test_hexdump.c ++++ b/lib/test_hexdump.c +@@ -25,36 +25,19 @@ static const char * const test_data_1[] __initconst = { + "4c", "d1", "19", "99", "43", "b1", "af", "0c", + }; + +-static const char * const test_data_2_le[] __initconst = { +- "32be", "7bdb", "180a", "b293", +- "ba70", "24c4", "837d", "9b34", +- "9ca6", "ad31", "0f9c", "e9ac", +- "d14c", "9919", "b143", "0caf", +-}; +- +-static const char * const test_data_2_be[] __initconst = { ++static const char * const test_data_2[] __initconst = { + "be32", "db7b", "0a18", "93b2", + "70ba", "c424", "7d83", "349b", + "a69c", "31ad", "9c0f", "ace9", + "4cd1", "1999", "43b1", "af0c", + }; + +-static const char * const test_data_4_le[] __initconst = { +- "7bdb32be", "b293180a", "24c4ba70", "9b34837d", +- "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", +-}; +- +-static const char * const test_data_4_be[] __initconst = { ++static const char * const test_data_4[] __initconst = { + "be32db7b", "0a1893b2", "70bac424", "7d83349b", + "a69c31ad", "9c0face9", "4cd11999", "43b1af0c", + }; + +-static const char * const test_data_8_le[] __initconst = { +- "b293180a7bdb32be", "9b34837d24c4ba70", +- "e9ac0f9cad319ca6", "0cafb1439919d14c", +-}; +- +-static const char * const test_data_8_be[] __initconst = { ++static const char * const test_data_8[] __initconst = { + "be32db7b0a1893b2", "70bac4247d83349b", + 
"a69c31ad9c0face9", "4cd1199943b1af0c", + }; +@@ -73,7 +56,6 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, + size_t l = len; + int gs = groupsize, rs = rowsize; + unsigned int i; +- const bool is_be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + + if (rs != 16 && rs != 32) + rs = 16; +@@ -85,11 +67,11 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, + gs = 1; + + if (gs == 8) +- result = is_be ? test_data_8_be : test_data_8_le; ++ result = test_data_8; + else if (gs == 4) +- result = is_be ? test_data_4_be : test_data_4_le; ++ result = test_data_4; + else if (gs == 2) +- result = is_be ? test_data_2_be : test_data_2_le; ++ result = test_data_2; + else + result = test_data_1; + +diff --git a/lib/test_printf.c b/lib/test_printf.c +index 07309c45f327..ac5f9f0eb4e0 100644 +--- a/lib/test_printf.c ++++ b/lib/test_printf.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -78,12 +79,6 @@ do_test(int bufsize, const char *expect, int elen, + return 1; + } + +- if (memchr_inv(test_buffer + written + 1, FILL_CHAR, BUF_SIZE + PAD_SIZE - (written + 1))) { +- pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n", +- bufsize, fmt); +- return 1; +- } +- + if (memcmp(test_buffer, expect, written)) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", + bufsize, fmt, test_buffer, written, expect); +@@ -783,6 +778,31 @@ test_pointer(void) + fourcc_pointer(); + } + ++static void printf_test_fn_0(struct printbuf *out) ++{ ++ prt_str(out, "0"); ++} ++ ++static void printf_test_fn_1(struct printbuf *out, void *p) ++{ ++ int *i = p; ++ ++ prt_printf(out, "%i", *i); ++} ++ ++static void __init ++test_fn(void) ++{ ++ int i = 1; ++ ++ test("0", "%pf()", CALL_PP(printf_test_fn_0)); ++ test("1", "%pf(%p)", CALL_PP(printf_test_fn_1, &i)); ++ /* ++ * Not tested, so we don't fail the build with -Werror: ++ */ ++ //test("1", "%(%p)", printf_test_fn, &i); ++} ++ + static void __init selftest(void) + { + alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL); +@@ -794,6 +814,7 @@ static void __init selftest(void) + test_number(); + test_string(); + test_pointer(); ++ test_fn(); + + kfree(alloced_buffer); + } +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 3c1853a9d1c0..d92a212db2f5 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -44,6 +44,7 @@ + #ifdef CONFIG_BLOCK + #include + #endif ++#include + + #include "../mm/internal.h" /* For the trace_print_flags arrays */ + +@@ -52,6 +53,7 @@ + #include + + #include ++#include + #include "kstrtox.h" + + /* Disable pointer hashing if requested */ +@@ -367,41 +369,51 @@ char *put_dec(char *buf, unsigned long long n) + + #endif + +-/* +- * Convert passed number to decimal string. +- * Returns the length of string. On buffer overflow, returns 0. +- * +- * If speed is not important, use snprintf(). It's easy to read the code. ++/** ++ * prt_u64_minwidth - print a u64, in decimal, with zero padding ++ * @out: printbuf to output to ++ * @num: u64 to print ++ * @width: minimum width + */ +-int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) ++void prt_u64_minwidth(struct printbuf *out, u64 num, unsigned width) + { + /* put_dec requires 2-byte alignment of the buffer. 
*/ + char tmp[sizeof(num) * 3] __aligned(2); +- int idx, len; ++ unsigned len = put_dec(tmp, num) - tmp; + +- /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ +- if (num <= 9) { +- tmp[0] = '0' + num; +- len = 1; +- } else { +- len = put_dec(tmp, num) - tmp; +- } ++ printbuf_make_room(out, max(len, width)); + +- if (len > size || width > size) +- return 0; ++ if (width > len) ++ __prt_chars_reserved(out, '0', width - len); + +- if (width > len) { +- width = width - len; +- for (idx = 0; idx < width; idx++) +- buf[idx] = ' '; +- } else { +- width = 0; +- } ++ while (len) ++ __prt_char_reserved(out, tmp[--len]); ++ printbuf_nul_terminate(out); ++} + +- for (idx = 0; idx < len; ++idx) +- buf[idx + width] = tmp[len - idx - 1]; ++/** ++ * prt_u64 - print a simple u64, in decimal ++ * @out: printbuf to output to ++ * @num: u64 to print ++ */ ++void prt_u64(struct printbuf *out, u64 num) ++{ ++ prt_u64_minwidth(out, num, 0); ++} ++ ++/* ++ * Convert passed number to decimal string. ++ * Returns the length of string. On buffer overflow, returns 0. ++ * ++ * Consider switching to printbufs and using prt_u64() or prt_u64_minwith() ++ * instead. ++ */ ++int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(buf, size); + +- return len + width; ++ prt_u64_minwidth(&out, num, width); ++ return out.pos; + } + + #define SIGN 1 /* unsigned/signed, must be 1 */ +@@ -435,7 +447,8 @@ enum format_type { + FORMAT_TYPE_UINT, + FORMAT_TYPE_INT, + FORMAT_TYPE_SIZE_T, +- FORMAT_TYPE_PTRDIFF ++ FORMAT_TYPE_PTRDIFF, ++ FORMAT_TYPE_FN, + }; + + struct printf_spec { +@@ -451,128 +464,103 @@ static_assert(sizeof(struct printf_spec) == 8); + #define PRECISION_MAX ((1 << 15) - 1) + + static noinline_for_stack +-char *number(char *buf, char *end, unsigned long long num, +- struct printf_spec spec) ++void number(struct printbuf *out, unsigned long long num, ++ struct printf_spec spec) + { + /* put_dec requires 2-byte alignment of the buffer. */ + char tmp[3 * sizeof(num)] __aligned(2); +- char sign; +- char locase; ++ char sign = 0; ++ /* locase = 0 or 0x20. ORing digits or letters with 'locase' ++ * produces same digits or (maybe lowercased) letters */ ++ char locase = (spec.flags & SMALL); + int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); +- int i; + bool is_zero = num == 0LL; + int field_width = spec.field_width; + int precision = spec.precision; ++ int nr_digits = 0; ++ int output_bytes = 0; + +- /* locase = 0 or 0x20. 
ORing digits or letters with 'locase' +- * produces same digits or (maybe lowercased) letters */ +- locase = (spec.flags & SMALL); + if (spec.flags & LEFT) + spec.flags &= ~ZEROPAD; +- sign = 0; + if (spec.flags & SIGN) { + if ((signed long long)num < 0) { + sign = '-'; + num = -(signed long long)num; +- field_width--; ++ output_bytes++; + } else if (spec.flags & PLUS) { + sign = '+'; +- field_width--; ++ output_bytes++; + } else if (spec.flags & SPACE) { + sign = ' '; +- field_width--; ++ output_bytes++; + } + } + if (need_pfx) { + if (spec.base == 16) +- field_width -= 2; ++ output_bytes += 2; + else if (!is_zero) +- field_width--; ++ output_bytes++; + } + + /* generate full string in tmp[], in reverse order */ +- i = 0; +- if (num < spec.base) +- tmp[i++] = hex_asc_upper[num] | locase; +- else if (spec.base != 10) { /* 8 or 16 */ ++ if (spec.base == 10) { ++ nr_digits = put_dec(tmp, num) - tmp; ++ } else { /* 8 or 16 */ + int mask = spec.base - 1; +- int shift = 3; ++ int shift = ilog2((unsigned) spec.base); + +- if (spec.base == 16) +- shift = 4; + do { +- tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); ++ tmp[nr_digits++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); + num >>= shift; + } while (num); +- } else { /* base 10 */ +- i = put_dec(tmp, num) - tmp; + } + + /* printing 100 using %2d gives "100", not "00" */ +- if (i > precision) +- precision = i; ++ precision = max(nr_digits, precision); ++ output_bytes += precision; ++ field_width = max(0, field_width - output_bytes); ++ ++ printbuf_make_room(out, field_width + output_bytes); ++ + /* leading space padding */ +- field_width -= precision; +- if (!(spec.flags & (ZEROPAD | LEFT))) { +- while (--field_width >= 0) { +- if (buf < end) +- *buf = ' '; +- ++buf; +- } ++ if (!(spec.flags & (ZEROPAD | LEFT)) && field_width) { ++ __prt_chars_reserved(out, ' ', field_width); ++ field_width = 0; + } ++ + /* sign */ +- if (sign) { +- if (buf < end) +- *buf = sign; +- ++buf; +- } ++ if (sign) ++ __prt_char_reserved(out, sign); ++ + /* "0x" / "0" prefix */ + if (need_pfx) { +- if (spec.base == 16 || !is_zero) { +- if (buf < end) +- *buf = '0'; +- ++buf; +- } +- if (spec.base == 16) { +- if (buf < end) +- *buf = ('X' | locase); +- ++buf; +- } ++ if (spec.base == 16 || !is_zero) ++ __prt_char_reserved(out, '0'); ++ if (spec.base == 16) ++ __prt_char_reserved(out, 'X' | locase); + } +- /* zero or space padding */ +- if (!(spec.flags & LEFT)) { +- char c = ' ' + (spec.flags & ZEROPAD); + +- while (--field_width >= 0) { +- if (buf < end) +- *buf = c; +- ++buf; +- } +- } +- /* hmm even more zero padding? 
*/ +- while (i <= --precision) { +- if (buf < end) +- *buf = '0'; +- ++buf; +- } ++ /* zero padding */ ++ if (!(spec.flags & LEFT) && field_width) ++ __prt_chars_reserved(out, '0', field_width); ++ ++ /* zero padding from precision */ ++ if (precision > nr_digits) ++ __prt_chars_reserved(out, '0', precision - nr_digits); ++ + /* actual digits of result */ +- while (--i >= 0) { +- if (buf < end) +- *buf = tmp[i]; +- ++buf; +- } ++ while (--nr_digits >= 0) ++ __prt_char_reserved(out, tmp[nr_digits]); ++ + /* trailing space padding */ +- while (--field_width >= 0) { +- if (buf < end) +- *buf = ' '; +- ++buf; +- } ++ if ((spec.flags & LEFT) && field_width) ++ __prt_chars_reserved(out, ' ', field_width); + +- return buf; ++ printbuf_nul_terminate(out); + } + + static noinline_for_stack +-char *special_hex_number(char *buf, char *end, unsigned long long num, int size) ++void special_hex_number(struct printbuf *out, unsigned long long num, int size) + { + struct printf_spec spec; + +@@ -582,25 +570,28 @@ char *special_hex_number(char *buf, char *end, unsigned long long num, int size) + spec.base = 16; + spec.precision = -1; + +- return number(buf, end, num, spec); ++ number(out, num, spec); + } + +-static void move_right(char *buf, char *end, unsigned len, unsigned spaces) ++/* ++ * inserts @spaces spaces @len from the end of @out ++ */ ++static void move_right(struct printbuf *out, ++ unsigned len, unsigned spaces) + { +- size_t size; +- if (buf >= end) /* nowhere to put anything */ +- return; +- size = end - buf; +- if (size <= spaces) { +- memset(buf, ' ', size); +- return; +- } +- if (len) { +- if (len > size - spaces) +- len = size - spaces; +- memmove(buf + spaces, buf, len); +- } +- memset(buf, ' ', spaces); ++ unsigned move_src = out->pos - len; ++ unsigned move_dst = move_src + spaces; ++ unsigned remaining_from_dst = move_dst < out->size ? out->size - move_dst : 0; ++ unsigned remaining_from_src = move_src < out->size ? out->size - move_src : 0; ++ ++ BUG_ON(len > out->pos); ++ ++ memmove(out->buf + move_dst, ++ out->buf + move_src, ++ min(remaining_from_dst, len)); ++ memset(out->buf + move_src, ' ', ++ min(remaining_from_src, spaces)); ++ out->pos += spaces; + } + + /* +@@ -612,67 +603,68 @@ static void move_right(char *buf, char *end, unsigned len, unsigned spaces) + * Returns: new buffer position after padding. + */ + static noinline_for_stack +-char *widen_string(char *buf, int n, char *end, struct printf_spec spec) ++void widen_string(struct printbuf *out, int n, ++ struct printf_spec spec) + { + unsigned spaces; + + if (likely(n >= spec.field_width)) +- return buf; ++ return; + /* we want to pad the sucker */ + spaces = spec.field_width - n; +- if (!(spec.flags & LEFT)) { +- move_right(buf - n, end, n, spaces); +- return buf + spaces; +- } +- while (spaces--) { +- if (buf < end) +- *buf = ' '; +- ++buf; ++ if (!(spec.flags & LEFT)) ++ move_right(out, n, spaces); ++ else ++ prt_chars(out, ' ', spaces); ++} ++ ++static void do_width_precision(struct printbuf *out, unsigned prev_pos, ++ struct printf_spec spec) ++{ ++ unsigned n = out->pos - prev_pos; ++ ++ if (n > spec.precision) { ++ out->pos -= n - spec.precision; ++ n = spec.precision; + } +- return buf; ++ ++ widen_string(out, n, spec); + } + + /* Handle string from a well known address. 
*/ +-static char *string_nocheck(char *buf, char *end, const char *s, +- struct printf_spec spec) ++static void string_nocheck(struct printbuf *out, ++ const char *s, ++ struct printf_spec spec) + { +- int len = 0; +- int lim = spec.precision; ++ int len = strnlen(s, spec.precision); + +- while (lim--) { +- char c = *s++; +- if (!c) +- break; +- if (buf < end) +- *buf = c; +- ++buf; +- ++len; +- } +- return widen_string(buf, len, end, spec); ++ prt_bytes(out, s, len); ++ widen_string(out, len, spec); + } + +-static char *err_ptr(char *buf, char *end, void *ptr, +- struct printf_spec spec) ++static void err_ptr(struct printbuf *out, void *ptr, ++ struct printf_spec spec) + { + int err = PTR_ERR(ptr); + const char *sym = errname(err); + +- if (sym) +- return string_nocheck(buf, end, sym, spec); +- +- /* +- * Somebody passed ERR_PTR(-1234) or some other non-existing +- * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to +- * printing it as its decimal representation. +- */ +- spec.flags |= SIGN; +- spec.base = 10; +- return number(buf, end, err, spec); ++ if (sym) { ++ string_nocheck(out, sym, spec); ++ } else { ++ /* ++ * Somebody passed ERR_PTR(-1234) or some other non-existing ++ * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to ++ * printing it as its decimal representation. ++ */ ++ spec.flags |= SIGN; ++ spec.base = 10; ++ number(out, err, spec); ++ } + } + + /* Be careful: error messages must fit into the given buffer. */ +-static char *error_string(char *buf, char *end, const char *s, +- struct printf_spec spec) ++static void error_string_spec(struct printbuf *out, const char *s, ++ struct printf_spec spec) + { + /* + * Hard limit to avoid a completely insane messages. It actually +@@ -682,7 +674,7 @@ static char *error_string(char *buf, char *end, const char *s, + if (spec.precision == -1) + spec.precision = 2 * sizeof(void *); + +- return string_nocheck(buf, end, s, spec); ++ string_nocheck(out, s, spec); + } + + /* +@@ -701,14 +693,15 @@ static const char *check_pointer_msg(const void *ptr) + return NULL; + } + +-static int check_pointer(char **buf, char *end, const void *ptr, ++static int check_pointer_spec(struct printbuf *out, ++ const void *ptr, + struct printf_spec spec) + { + const char *err_msg; + + err_msg = check_pointer_msg(ptr); + if (err_msg) { +- *buf = error_string(*buf, end, err_msg, spec); ++ error_string_spec(out, err_msg, spec); + return -EFAULT; + } + +@@ -716,18 +709,50 @@ static int check_pointer(char **buf, char *end, const void *ptr, + } + + static noinline_for_stack +-char *string(char *buf, char *end, const char *s, +- struct printf_spec spec) ++void string_spec(struct printbuf *out, ++ const char *s, ++ struct printf_spec spec) + { +- if (check_pointer(&buf, end, s, spec)) +- return buf; ++ if (check_pointer_spec(out, s, spec)) ++ return; + +- return string_nocheck(buf, end, s, spec); ++ string_nocheck(out, s, spec); + } + +-static char *pointer_string(char *buf, char *end, +- const void *ptr, +- struct printf_spec spec) ++static void error_string(struct printbuf *out, const char *s) ++{ ++ /* ++ * Hard limit to avoid a completely insane messages. It actually ++ * works pretty well because most error messages are in ++ * the many pointer format modifiers. 
++ */ ++ prt_bytes(out, s, min(strlen(s), 2 * sizeof(void *))); ++} ++ ++static int check_pointer(struct printbuf *out, const void *ptr) ++{ ++ const char *err_msg; ++ ++ err_msg = check_pointer_msg(ptr); ++ if (err_msg) { ++ error_string(out, err_msg); ++ return -EFAULT; ++ } ++ ++ return 0; ++} ++ ++static void string(struct printbuf *out, const char *s) ++{ ++ if (check_pointer(out, s)) ++ return; ++ ++ prt_str(out, s); ++} ++ ++static void pointer_string(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + spec.base = 16; + spec.flags |= SMALL; +@@ -736,7 +761,7 @@ static char *pointer_string(char *buf, char *end, + spec.flags |= ZEROPAD; + } + +- return number(buf, end, (unsigned long int)ptr, spec); ++ number(out, (unsigned long int)ptr, spec); + } + + /* Make pointers available for printing early in the boot sequence. */ +@@ -801,8 +826,9 @@ int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) + return __ptr_to_hashval(ptr, hashval_out); + } + +-static char *ptr_to_id(char *buf, char *end, const void *ptr, +- struct printf_spec spec) ++static void ptr_to_id(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; + unsigned long hashval; +@@ -813,47 +839,49 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, + * as they are not actual addresses. + */ + if (IS_ERR_OR_NULL(ptr)) +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + + /* When debugging early boot use non-cryptographically secure hash. */ + if (unlikely(debug_boot_weak_hash)) { + hashval = hash_long((unsigned long)ptr, 32); +- return pointer_string(buf, end, (const void *)hashval, spec); ++ return pointer_string(out, (const void *)hashval, spec); + } + + ret = __ptr_to_hashval(ptr, &hashval); + if (ret) { + spec.field_width = 2 * sizeof(ptr); + /* string length must be less than default_width */ +- return error_string(buf, end, str, spec); ++ return error_string_spec(out, str, spec); + } + +- return pointer_string(buf, end, (const void *)hashval, spec); ++ pointer_string(out, (const void *)hashval, spec); + } + +-static char *default_pointer(char *buf, char *end, const void *ptr, +- struct printf_spec spec) ++static void default_pointer(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + /* + * default is to _not_ leak addresses, so hash before printing, + * unless no_hash_pointers is specified on the command line. + */ + if (unlikely(no_hash_pointers)) +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + +- return ptr_to_id(buf, end, ptr, spec); ++ return ptr_to_id(out, ptr, spec); + } + + int kptr_restrict __read_mostly; + + static noinline_for_stack +-char *restricted_pointer(char *buf, char *end, const void *ptr, +- struct printf_spec spec) ++void restricted_pointer(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + switch (kptr_restrict) { + case 0: + /* Handle as %p, hash and do _not_ leak addresses. 
*/ +- return default_pointer(buf, end, ptr, spec); ++ return default_pointer(out, ptr, spec); + case 1: { + const struct cred *cred; + +@@ -864,7 +892,7 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, + if (in_irq() || in_serving_softirq() || in_nmi()) { + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(ptr); +- return error_string(buf, end, "pK-error", spec); ++ return error_string_spec(out, "pK-error", spec); + } + + /* +@@ -890,17 +918,16 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, + break; + } + +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + } + + static noinline_for_stack +-char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, +- const char *fmt) ++void dentry_name(struct printbuf *out, const struct dentry *d, ++ const char *fmt) + { +- const char *array[4], *s; ++ const char *array[4]; + const struct dentry *p; +- int depth; +- int i, n; ++ int i, depth; + + switch (fmt[1]) { + case '2': case '3': case '4': +@@ -912,9 +939,9 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp + + rcu_read_lock(); + for (i = 0; i < depth; i++, d = p) { +- if (check_pointer(&buf, end, d, spec)) { ++ if (check_pointer(out, d)) { + rcu_read_unlock(); +- return buf; ++ return; + } + + p = READ_ONCE(d->d_parent); +@@ -926,58 +953,46 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp + break; + } + } +- s = array[--i]; +- for (n = 0; n != spec.precision; n++, buf++) { +- char c = *s++; +- if (!c) { +- if (!i) +- break; +- c = '/'; +- s = array[--i]; +- } +- if (buf < end) +- *buf = c; ++ while (1) { ++ prt_str(out, array[--i]); ++ if (!i) ++ break; ++ prt_char(out, '/'); + } + rcu_read_unlock(); +- return widen_string(buf, n, end, spec); + } + + static noinline_for_stack +-char *file_dentry_name(char *buf, char *end, const struct file *f, +- struct printf_spec spec, const char *fmt) ++void file_dentry_name(struct printbuf *out, const struct file *f, ++ const char *fmt) + { +- if (check_pointer(&buf, end, f, spec)) +- return buf; ++ if (check_pointer(out, f)) ++ return; + +- return dentry_name(buf, end, f->f_path.dentry, spec, fmt); ++ return dentry_name(out, f->f_path.dentry, fmt); + } + #ifdef CONFIG_BLOCK + static noinline_for_stack +-char *bdev_name(char *buf, char *end, struct block_device *bdev, +- struct printf_spec spec, const char *fmt) ++void bdev_name(struct printbuf *out, struct block_device *bdev) + { + struct gendisk *hd; + +- if (check_pointer(&buf, end, bdev, spec)) +- return buf; ++ if (check_pointer(out, bdev)) ++ return; + + hd = bdev->bd_disk; +- buf = string(buf, end, hd->disk_name, spec); ++ string(out, hd->disk_name); + if (bdev->bd_partno) { +- if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { +- if (buf < end) +- *buf = 'p'; +- buf++; +- } +- buf = number(buf, end, bdev->bd_partno, spec); ++ if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) ++ prt_char(out, 'p'); ++ prt_u64(out, bdev->bd_partno); + } +- return buf; + } + #endif + + static noinline_for_stack +-char *symbol_string(char *buf, char *end, void *ptr, +- struct printf_spec spec, const char *fmt) ++void symbol_string(struct printbuf *out, void *ptr, ++ const char *fmt) + { + unsigned long value; + #ifdef CONFIG_KALLSYMS +@@ -1000,17 +1015,12 @@ char *symbol_string(char *buf, char *end, void *ptr, + else + sprint_symbol_no_offset(sym, value); + +- return string_nocheck(buf, end, sym, spec); ++ prt_str(out, sym); + #else +- 
return special_hex_number(buf, end, value, sizeof(void *)); ++ special_hex_number(out, value, sizeof(void *)); + #endif + } + +-static const struct printf_spec default_str_spec = { +- .field_width = -1, +- .precision = -1, +-}; +- + static const struct printf_spec default_flag_spec = { + .base = 16, + .precision = -1, +@@ -1022,23 +1032,9 @@ static const struct printf_spec default_dec_spec = { + .precision = -1, + }; + +-static const struct printf_spec default_dec02_spec = { +- .base = 10, +- .field_width = 2, +- .precision = -1, +- .flags = ZEROPAD, +-}; +- +-static const struct printf_spec default_dec04_spec = { +- .base = 10, +- .field_width = 4, +- .precision = -1, +- .flags = ZEROPAD, +-}; +- + static noinline_for_stack +-char *resource_string(char *buf, char *end, struct resource *res, +- struct printf_spec spec, const char *fmt) ++void resource_string(struct printbuf *out, struct resource *res, ++ int decode) + { + #ifndef IO_RSRC_PRINTK_SIZE + #define IO_RSRC_PRINTK_SIZE 6 +@@ -1077,80 +1073,79 @@ char *resource_string(char *buf, char *end, struct resource *res, + #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) + #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") + #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") +- char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, +- 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; +- +- char *p = sym, *pend = sym + sizeof(sym); +- int decode = (fmt[0] == 'R') ? 1 : 0; + const struct printf_spec *specp; + +- if (check_pointer(&buf, end, res, spec)) +- return buf; ++ if (check_pointer(out, res)) ++ return; + +- *p++ = '['; ++ prt_char(out, '['); + if (res->flags & IORESOURCE_IO) { +- p = string_nocheck(p, pend, "io ", str_spec); ++ string_nocheck(out, "io ", str_spec); + specp = &io_spec; + } else if (res->flags & IORESOURCE_MEM) { +- p = string_nocheck(p, pend, "mem ", str_spec); ++ string_nocheck(out, "mem ", str_spec); + specp = &mem_spec; + } else if (res->flags & IORESOURCE_IRQ) { +- p = string_nocheck(p, pend, "irq ", str_spec); ++ string_nocheck(out, "irq ", str_spec); + specp = &default_dec_spec; + } else if (res->flags & IORESOURCE_DMA) { +- p = string_nocheck(p, pend, "dma ", str_spec); ++ string_nocheck(out, "dma ", str_spec); + specp = &default_dec_spec; + } else if (res->flags & IORESOURCE_BUS) { +- p = string_nocheck(p, pend, "bus ", str_spec); ++ string_nocheck(out, "bus ", str_spec); + specp = &bus_spec; + } else { +- p = string_nocheck(p, pend, "??? ", str_spec); ++ string_nocheck(out, "??? 
", str_spec); + specp = &mem_spec; + decode = 0; + } + if (decode && res->flags & IORESOURCE_UNSET) { +- p = string_nocheck(p, pend, "size ", str_spec); +- p = number(p, pend, resource_size(res), *specp); ++ string_nocheck(out, "size ", str_spec); ++ number(out, resource_size(res), *specp); + } else { +- p = number(p, pend, res->start, *specp); ++ number(out, res->start, *specp); + if (res->start != res->end) { +- *p++ = '-'; +- p = number(p, pend, res->end, *specp); ++ prt_char(out, '-'); ++ number(out, res->end, *specp); + } + } + if (decode) { + if (res->flags & IORESOURCE_MEM_64) +- p = string_nocheck(p, pend, " 64bit", str_spec); ++ string_nocheck(out, " 64bit", str_spec); + if (res->flags & IORESOURCE_PREFETCH) +- p = string_nocheck(p, pend, " pref", str_spec); ++ string_nocheck(out, " pref", str_spec); + if (res->flags & IORESOURCE_WINDOW) +- p = string_nocheck(p, pend, " window", str_spec); ++ string_nocheck(out, " window", str_spec); + if (res->flags & IORESOURCE_DISABLED) +- p = string_nocheck(p, pend, " disabled", str_spec); ++ string_nocheck(out, " disabled", str_spec); + } else { +- p = string_nocheck(p, pend, " flags ", str_spec); +- p = number(p, pend, res->flags, default_flag_spec); ++ string_nocheck(out, " flags ", str_spec); ++ number(out, res->flags, default_flag_spec); + } +- *p++ = ']'; +- *p = '\0'; ++ prt_char(out, ']'); + +- return string_nocheck(buf, end, sym, spec); ++ printbuf_nul_terminate(out); + } + + static noinline_for_stack +-char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +- const char *fmt) ++void hex_string(struct printbuf *out, const u8 *addr, ++ int len, const char *fmt) + { +- int i, len = 1; /* if we pass '%ph[CDN]', field width remains +- negative value, fallback to the default */ + char separator; + +- if (spec.field_width == 0) +- /* nothing to print */ +- return buf; ++ /* nothing to print */ ++ if (len == 0) ++ return; ++ ++ /* if we pass '%ph[CDN]', field width remains ++ negative value, fallback to the default */ ++ if (len < 0) ++ len = 1; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ len = min(len, 64); ++ ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'C': +@@ -1167,41 +1162,21 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, + break; + } + +- if (spec.field_width > 0) +- len = min_t(int, spec.field_width, 64); +- +- for (i = 0; i < len; ++i) { +- if (buf < end) +- *buf = hex_asc_hi(addr[i]); +- ++buf; +- if (buf < end) +- *buf = hex_asc_lo(addr[i]); +- ++buf; +- +- if (separator && i != len - 1) { +- if (buf < end) +- *buf = separator; +- ++buf; +- } +- } +- +- return buf; ++ prt_hex_bytes(out, addr, len, 1, separator); + } + + static noinline_for_stack +-char *bitmap_string(char *buf, char *end, unsigned long *bitmap, +- struct printf_spec spec, const char *fmt) ++void bitmap_string(struct printbuf *out, unsigned long *bitmap, int nr_bits) + { ++ struct printf_spec spec = { .flags = SMALL | ZEROPAD, .base = 16 }; + const int CHUNKSZ = 32; +- int nr_bits = max_t(int, spec.field_width, 0); + int i, chunksz; + bool first = true; + +- if (check_pointer(&buf, end, bitmap, spec)) +- return buf; ++ nr_bits = max(nr_bits, 0); + +- /* reused to print numbers */ +- spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 }; ++ if (check_pointer(out, bitmap)) ++ return; + + chunksz = nr_bits & (CHUNKSZ - 1); + if (chunksz == 0) +@@ -1217,63 +1192,53 @@ char *bitmap_string(char *buf, char *end, unsigned long *bitmap, + bit = i % BITS_PER_LONG; 
+ val = (bitmap[word] >> bit) & chunkmask; + +- if (!first) { +- if (buf < end) +- *buf = ','; +- buf++; +- } ++ if (!first) ++ prt_char(out, ','); + first = false; + + spec.field_width = DIV_ROUND_UP(chunksz, 4); +- buf = number(buf, end, val, spec); ++ number(out, val, spec); + + chunksz = CHUNKSZ; + } +- return buf; + } + + static noinline_for_stack +-char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, +- struct printf_spec spec, const char *fmt) ++void bitmap_list_string(struct printbuf *out, unsigned long *bitmap, ++ int nr_bits) + { +- int nr_bits = max_t(int, spec.field_width, 0); + bool first = true; + int rbot, rtop; + +- if (check_pointer(&buf, end, bitmap, spec)) +- return buf; ++ nr_bits = max(nr_bits, 0); ++ ++ if (check_pointer(out, bitmap)) ++ return ; + + for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { +- if (!first) { +- if (buf < end) +- *buf = ','; +- buf++; +- } ++ if (!first) ++ prt_char(out, ','); + first = false; + +- buf = number(buf, end, rbot, default_dec_spec); ++ prt_u64(out, rbot); + if (rtop == rbot + 1) + continue; + +- if (buf < end) +- *buf = '-'; +- buf = number(++buf, end, rtop - 1, default_dec_spec); ++ prt_char(out, '-'); ++ prt_u64(out, rtop - 1); + } +- return buf; + } + + static noinline_for_stack +-char *mac_address_string(char *buf, char *end, u8 *addr, +- struct printf_spec spec, const char *fmt) ++void mac_address_string(struct printbuf *out, u8 *addr, ++ const char *fmt) + { +- char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; +- char *p = mac_addr; + int i; + char separator; + bool reversed = false; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'F': +@@ -1291,25 +1256,23 @@ char *mac_address_string(char *buf, char *end, u8 *addr, + + for (i = 0; i < 6; i++) { + if (reversed) +- p = hex_byte_pack(p, addr[5 - i]); ++ prt_hex_byte(out, addr[5 - i]); + else +- p = hex_byte_pack(p, addr[i]); ++ prt_hex_byte(out, addr[i]); + + if (fmt[0] == 'M' && i != 5) +- *p++ = separator; ++ prt_char(out, separator); + } +- *p = '\0'; +- +- return string_nocheck(buf, end, mac_addr, spec); + } + + static noinline_for_stack +-char *ip4_string(char *p, const u8 *addr, const char *fmt) ++void ip4_string(struct printbuf *out, const u8 *addr, const char *fmt) + { +- int i; +- bool leading_zeros = (fmt[0] == 'i'); +- int index; +- int step; ++ struct printf_spec spec = default_dec_spec; ++ int i, index, step; ++ ++ if (fmt[0] == 'i') ++ spec.precision = 3; + + switch (fmt[2]) { + case 'h': +@@ -1333,28 +1296,15 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) + break; + } + for (i = 0; i < 4; i++) { +- char temp[4] __aligned(2); /* hold each IP quad in reverse order */ +- int digits = put_dec_trunc8(temp, addr[index]) - temp; +- if (leading_zeros) { +- if (digits < 3) +- *p++ = '0'; +- if (digits < 2) +- *p++ = '0'; +- } +- /* reverse the digits in the quad */ +- while (digits--) +- *p++ = temp[digits]; +- if (i < 3) +- *p++ = '.'; ++ if (i) ++ prt_char(out, '.'); ++ number(out, addr[index], spec); + index += step; + } +- *p = '\0'; +- +- return p; + } + + static noinline_for_stack +-char *ip6_compressed_string(char *p, const char *addr) ++void ip6_compressed_string(struct printbuf *out, const char *addr) + { + int i, j, range; + unsigned char zerolength[8]; +@@ -1398,14 +1348,14 @@ char *ip6_compressed_string(char *p, const char *addr) + for (i = 0; i < range; i++) { + if (i == colonpos) { + if (needcolon || i == 0) +- *p++ = ':'; +- *p++ = ':'; ++ 
__prt_char(out, ':'); ++ __prt_char(out, ':'); + needcolon = false; + i += longest - 1; + continue; + } + if (needcolon) { +- *p++ = ':'; ++ __prt_char(out, ':'); + needcolon = false; + } + /* hex u16 without leading 0s */ +@@ -1414,81 +1364,56 @@ char *ip6_compressed_string(char *p, const char *addr) + lo = word & 0xff; + if (hi) { + if (hi > 0x0f) +- p = hex_byte_pack(p, hi); ++ prt_hex_byte(out, hi); + else +- *p++ = hex_asc_lo(hi); +- p = hex_byte_pack(p, lo); ++ __prt_char(out, hex_asc_lo(hi)); ++ prt_hex_byte(out, lo); + } + else if (lo > 0x0f) +- p = hex_byte_pack(p, lo); ++ prt_hex_byte(out, lo); + else +- *p++ = hex_asc_lo(lo); ++ __prt_char(out, hex_asc_lo(lo)); + needcolon = true; + } + + if (useIPv4) { + if (needcolon) +- *p++ = ':'; +- p = ip4_string(p, &in6.s6_addr[12], "I4"); ++ __prt_char(out, ':'); ++ ip4_string(out, &in6.s6_addr[12], "I4"); + } +- *p = '\0'; +- +- return p; + } + + static noinline_for_stack +-char *ip6_string(char *p, const char *addr, const char *fmt) ++void ip6_string(struct printbuf *out, const char *addr, const char *fmt) + { + int i; + + for (i = 0; i < 8; i++) { +- p = hex_byte_pack(p, *addr++); +- p = hex_byte_pack(p, *addr++); ++ prt_hex_byte(out, *addr++); ++ prt_hex_byte(out, *addr++); + if (fmt[0] == 'I' && i != 7) +- *p++ = ':'; ++ prt_char(out, ':'); + } +- *p = '\0'; +- +- return p; + } + + static noinline_for_stack +-char *ip6_addr_string(char *buf, char *end, const u8 *addr, +- struct printf_spec spec, const char *fmt) ++void ip6_addr_string(struct printbuf *out, const u8 *addr, ++ const char *fmt) + { +- char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; +- + if (fmt[0] == 'I' && fmt[2] == 'c') +- ip6_compressed_string(ip6_addr, addr); ++ ip6_compressed_string(out, addr); + else +- ip6_string(ip6_addr, addr, fmt); +- +- return string_nocheck(buf, end, ip6_addr, spec); +-} +- +-static noinline_for_stack +-char *ip4_addr_string(char *buf, char *end, const u8 *addr, +- struct printf_spec spec, const char *fmt) +-{ +- char ip4_addr[sizeof("255.255.255.255")]; +- +- ip4_string(ip4_addr, addr, fmt); +- +- return string_nocheck(buf, end, ip4_addr, spec); ++ ip6_string(out, addr, fmt); + } + + static noinline_for_stack +-char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, +- struct printf_spec spec, const char *fmt) ++void ip6_addr_string_sa(struct printbuf *out, ++ const struct sockaddr_in6 *sa, ++ const char *fmt) + { + bool have_p = false, have_s = false, have_f = false, have_c = false; +- char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + +- sizeof(":12345") + sizeof("/123456789") + +- sizeof("%1234567890")]; +- char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); + const u8 *addr = (const u8 *) &sa->sin6_addr; + char fmt6[2] = { fmt[0], '6' }; +- u8 off = 0; + + fmt++; + while (isalpha(*++fmt)) { +@@ -1508,44 +1433,36 @@ char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, + } + } + +- if (have_p || have_s || have_f) { +- *p = '['; +- off = 1; +- } ++ if (have_p || have_s || have_f) ++ prt_char(out, '['); + + if (fmt6[0] == 'I' && have_c) +- p = ip6_compressed_string(ip6_addr + off, addr); ++ ip6_compressed_string(out, addr); + else +- p = ip6_string(ip6_addr + off, addr, fmt6); ++ ip6_string(out, addr, fmt6); + + if (have_p || have_s || have_f) +- *p++ = ']'; ++ prt_char(out, ']'); + + if (have_p) { +- *p++ = ':'; +- p = number(p, pend, ntohs(sa->sin6_port), spec); ++ prt_char(out, ':'); ++ prt_u64(out, ntohs(sa->sin6_port)); + } + if (have_f) 
{ +- *p++ = '/'; +- p = number(p, pend, ntohl(sa->sin6_flowinfo & +- IPV6_FLOWINFO_MASK), spec); ++ prt_char(out, '/'); ++ prt_u64(out, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK)); + } + if (have_s) { +- *p++ = '%'; +- p = number(p, pend, sa->sin6_scope_id, spec); ++ prt_char(out, '%'); ++ prt_u64(out, sa->sin6_scope_id); + } +- *p = '\0'; +- +- return string_nocheck(buf, end, ip6_addr, spec); + } + + static noinline_for_stack +-char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, +- struct printf_spec spec, const char *fmt) ++void ip4_addr_string_sa(struct printbuf *out, const struct sockaddr_in *sa, ++ const char *fmt) + { + bool have_p = false; +- char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; +- char *pend = ip4_addr + sizeof(ip4_addr); + const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; + char fmt4[3] = { fmt[0], '4', 0 }; + +@@ -1564,30 +1481,27 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, + } + } + +- p = ip4_string(ip4_addr, addr, fmt4); ++ ip4_string(out, addr, fmt4); + if (have_p) { +- *p++ = ':'; +- p = number(p, pend, ntohs(sa->sin_port), spec); ++ prt_char(out, ':'); ++ prt_u64(out, ntohs(sa->sin_port)); + } +- *p = '\0'; +- +- return string_nocheck(buf, end, ip4_addr, spec); + } + + static noinline_for_stack +-char *ip_addr_string(char *buf, char *end, const void *ptr, +- struct printf_spec spec, const char *fmt) ++void ip_addr_string(struct printbuf *out, const void *ptr, ++ const char *fmt) + { + char *err_fmt_msg; + +- if (check_pointer(&buf, end, ptr, spec)) +- return buf; ++ if (check_pointer(out, ptr)) ++ return; + + switch (fmt[1]) { + case '6': +- return ip6_addr_string(buf, end, ptr, spec, fmt); ++ return ip6_addr_string(out, ptr, fmt); + case '4': +- return ip4_addr_string(buf, end, ptr, spec, fmt); ++ return ip4_string(out, ptr, fmt); + case 'S': { + const union { + struct sockaddr raw; +@@ -1597,21 +1511,21 @@ char *ip_addr_string(char *buf, char *end, const void *ptr, + + switch (sa->raw.sa_family) { + case AF_INET: +- return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); ++ return ip4_addr_string_sa(out, &sa->v4, fmt); + case AF_INET6: +- return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); ++ return ip6_addr_string_sa(out, &sa->v6, fmt); + default: +- return error_string(buf, end, "(einval)", spec); ++ return error_string(out, "(einval)"); + }} + } + + err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)"; +- return error_string(buf, end, err_fmt_msg, spec); ++ error_string(out, err_fmt_msg); + } + + static noinline_for_stack +-char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +- const char *fmt) ++void escaped_string(struct printbuf *out, u8 *addr, ++ struct printf_spec spec, const char *fmt) + { + bool found = true; + int count = 1; +@@ -1619,10 +1533,10 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, + int len; + + if (spec.field_width == 0) +- return buf; /* nothing to print */ ++ return; /* nothing to print */ + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer_spec(out, addr, spec)) ++ return; + + do { + switch (fmt[count++]) { +@@ -1657,44 +1571,32 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, + flags = ESCAPE_ANY_NP; + + len = spec.field_width < 0 ? 1 : spec.field_width; +- +- /* +- * string_escape_mem() writes as many characters as it can to +- * the given buffer, and returns the total size of the output +- * had the buffer been big enough. 
+- */ +- buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); +- +- return buf; ++ prt_escaped_string(out, addr, len, flags, NULL); + } + +-static char *va_format(char *buf, char *end, struct va_format *va_fmt, +- struct printf_spec spec, const char *fmt) ++static void va_format(struct printbuf *out, ++ struct va_format *va_fmt, ++ struct printf_spec spec, const char *fmt) + { + va_list va; + +- if (check_pointer(&buf, end, va_fmt, spec)) +- return buf; ++ if (check_pointer_spec(out, va_fmt, spec)) ++ return; + + va_copy(va, *va_fmt->va); +- buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va); ++ prt_vprintf(out, va_fmt->fmt, va); + va_end(va); +- +- return buf; + } + + static noinline_for_stack +-char *uuid_string(char *buf, char *end, const u8 *addr, +- struct printf_spec spec, const char *fmt) ++void uuid_string(struct printbuf *out, const u8 *addr, const char *fmt) + { +- char uuid[UUID_STRING_LEN + 1]; +- char *p = uuid; + int i; + const u8 *index = uuid_index; + bool uc = false; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (*(++fmt)) { + case 'L': +@@ -1710,60 +1612,54 @@ char *uuid_string(char *buf, char *end, const u8 *addr, + + for (i = 0; i < 16; i++) { + if (uc) +- p = hex_byte_pack_upper(p, addr[index[i]]); ++ prt_hex_byte_upper(out, addr[index[i]]); + else +- p = hex_byte_pack(p, addr[index[i]]); ++ prt_hex_byte(out, addr[index[i]]); + switch (i) { + case 3: + case 5: + case 7: + case 9: +- *p++ = '-'; ++ prt_char(out, '-'); + break; + } + } +- +- *p = 0; +- +- return string_nocheck(buf, end, uuid, spec); + } + + static noinline_for_stack +-char *netdev_bits(char *buf, char *end, const void *addr, +- struct printf_spec spec, const char *fmt) ++void netdev_bits(struct printbuf *out, const void *addr, ++ const char *fmt) + { + unsigned long long num; + int size; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'F': + num = *(const netdev_features_t *)addr; + size = sizeof(netdev_features_t); ++ special_hex_number(out, num, size); + break; + default: +- return error_string(buf, end, "(%pN?)", spec); ++ error_string(out, "(%pN?)"); ++ break; + } +- +- return special_hex_number(buf, end, num, size); + } + + static noinline_for_stack +-char *fourcc_string(char *buf, char *end, const u32 *fourcc, +- struct printf_spec spec, const char *fmt) ++void fourcc_string(struct printbuf *out, const u32 *fourcc, ++ const char *fmt) + { +- char output[sizeof("0123 little-endian (0x01234567)")]; +- char *p = output; + unsigned int i; + u32 orig, val; + + if (fmt[1] != 'c' || fmt[2] != 'c') +- return error_string(buf, end, "(%p4?)", spec); ++ return error_string(out, "(%p4?)"); + +- if (check_pointer(&buf, end, fourcc, spec)) +- return buf; ++ if (check_pointer(out, fourcc)) ++ return; + + orig = get_unaligned(fourcc); + val = orig & ~BIT(31); +@@ -1772,31 +1668,27 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, + unsigned char c = val >> (i * 8); + + /* Print non-control ASCII characters as-is, dot otherwise */ +- *p++ = isascii(c) && isprint(c) ? c : '.'; ++ prt_char(out, isascii(c) && isprint(c) ? c : '.'); + } + +- *p++ = ' '; +- strcpy(p, orig & BIT(31) ? 
"big-endian" : "little-endian"); +- p += strlen(p); +- +- *p++ = ' '; +- *p++ = '('; +- p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); +- *p++ = ')'; +- *p = '\0'; ++ prt_char(out, ' '); ++ prt_str(out, orig & BIT(31) ? "big-endian" : "little-endian"); + +- return string(buf, end, output, spec); ++ prt_char(out, ' '); ++ prt_char(out, '('); ++ special_hex_number(out, orig, sizeof(u32)); ++ prt_char(out, ')'); + } + + static noinline_for_stack +-char *address_val(char *buf, char *end, const void *addr, +- struct printf_spec spec, const char *fmt) ++void address_val(struct printbuf *out, const void *addr, ++ const char *fmt) + { + unsigned long long num; + int size; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'd': +@@ -1810,55 +1702,44 @@ char *address_val(char *buf, char *end, const void *addr, + break; + } + +- return special_hex_number(buf, end, num, size); ++ special_hex_number(out, num, size); + } + + static noinline_for_stack +-char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) ++void date_str(struct printbuf *out, ++ const struct rtc_time *tm, bool r) + { + int year = tm->tm_year + (r ? 0 : 1900); + int mon = tm->tm_mon + (r ? 0 : 1); + +- buf = number(buf, end, year, default_dec04_spec); +- if (buf < end) +- *buf = '-'; +- buf++; +- +- buf = number(buf, end, mon, default_dec02_spec); +- if (buf < end) +- *buf = '-'; +- buf++; +- +- return number(buf, end, tm->tm_mday, default_dec02_spec); ++ prt_u64_minwidth(out, year, 4); ++ prt_char(out, '-'); ++ prt_u64_minwidth(out, mon, 2); ++ prt_char(out, '-'); ++ prt_u64_minwidth(out, tm->tm_mday, 2); + } + + static noinline_for_stack +-char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) ++void time_str(struct printbuf *out, const struct rtc_time *tm, bool r) + { +- buf = number(buf, end, tm->tm_hour, default_dec02_spec); +- if (buf < end) +- *buf = ':'; +- buf++; +- +- buf = number(buf, end, tm->tm_min, default_dec02_spec); +- if (buf < end) +- *buf = ':'; +- buf++; +- +- return number(buf, end, tm->tm_sec, default_dec02_spec); ++ prt_u64_minwidth(out, tm->tm_hour, 2); ++ prt_char(out, ':'); ++ prt_u64_minwidth(out, tm->tm_min, 2); ++ prt_char(out, ':'); ++ prt_u64_minwidth(out, tm->tm_sec, 2); + } + + static noinline_for_stack +-char *rtc_str(char *buf, char *end, const struct rtc_time *tm, +- struct printf_spec spec, const char *fmt) ++void rtc_str(struct printbuf *out, const struct rtc_time *tm, ++ const char *fmt) + { + bool have_t = true, have_d = true; + bool raw = false, iso8601_separator = true; + bool found = true; + int count = 2; + +- if (check_pointer(&buf, end, tm, spec)) +- return buf; ++ if (check_pointer(out, tm)) ++ return; + + switch (fmt[count]) { + case 'd': +@@ -1886,21 +1767,16 @@ char *rtc_str(char *buf, char *end, const struct rtc_time *tm, + } while (found); + + if (have_d) +- buf = date_str(buf, end, tm, raw); +- if (have_d && have_t) { +- if (buf < end) +- *buf = iso8601_separator ? 'T' : ' '; +- buf++; +- } ++ date_str(out, tm, raw); ++ if (have_d && have_t) ++ prt_char(out, iso8601_separator ? 
'T' : ' '); + if (have_t) +- buf = time_str(buf, end, tm, raw); +- +- return buf; ++ time_str(out, tm, raw); + } + + static noinline_for_stack +-char *time64_str(char *buf, char *end, const time64_t time, +- struct printf_spec spec, const char *fmt) ++void time64_str(struct printbuf *out, const time64_t time, ++ const char *fmt) + { + struct rtc_time rtc_time; + struct tm tm; +@@ -1918,47 +1794,47 @@ char *time64_str(char *buf, char *end, const time64_t time, + + rtc_time.tm_isdst = 0; + +- return rtc_str(buf, end, &rtc_time, spec, fmt); ++ rtc_str(out, &rtc_time, fmt); + } + + static noinline_for_stack +-char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, +- const char *fmt) ++void time_and_date(struct printbuf *out, void *ptr, ++ const char *fmt) + { + switch (fmt[1]) { + case 'R': +- return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); ++ return rtc_str(out, (const struct rtc_time *)ptr, fmt); + case 'T': +- return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); ++ return time64_str(out, *(const time64_t *)ptr, fmt); + default: +- return error_string(buf, end, "(%pt?)", spec); ++ return error_string(out, "(%pt?)"); + } + } + + static noinline_for_stack +-char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, +- const char *fmt) ++void clock(struct printbuf *out, struct clk *clk, ++ struct printf_spec spec, const char *fmt) + { + if (!IS_ENABLED(CONFIG_HAVE_CLK)) +- return error_string(buf, end, "(%pC?)", spec); ++ return error_string_spec(out, "(%pC?)", spec); + +- if (check_pointer(&buf, end, clk, spec)) +- return buf; ++ if (check_pointer_spec(out, clk, spec)) ++ return; + + switch (fmt[1]) { + case 'n': + default: + #ifdef CONFIG_COMMON_CLK +- return string(buf, end, __clk_get_name(clk), spec); ++ return string_spec(out, __clk_get_name(clk), spec); + #else +- return ptr_to_id(buf, end, clk, spec); ++ return ptr_to_id(out, clk, spec); + #endif + } + } + + static +-char *format_flags(char *buf, char *end, unsigned long flags, +- const struct trace_print_flags *names) ++void format_flags(struct printbuf *out, unsigned long flags, ++ const struct trace_print_flags *names) + { + unsigned long mask; + +@@ -1967,20 +1843,15 @@ char *format_flags(char *buf, char *end, unsigned long flags, + if ((flags & mask) != mask) + continue; + +- buf = string(buf, end, names->name, default_str_spec); ++ string(out, names->name); + + flags &= ~mask; +- if (flags) { +- if (buf < end) +- *buf = '|'; +- buf++; +- } ++ if (flags) ++ prt_char(out, '|'); + } + + if (flags) +- buf = number(buf, end, flags, default_flag_spec); +- +- return buf; ++ number(out, flags, default_flag_spec); + } + + struct page_flags_fields { +@@ -2005,20 +1876,18 @@ static const struct page_flags_fields pff[] = { + }; + + static +-char *format_page_flags(char *buf, char *end, unsigned long flags) ++void format_page_flags(struct printbuf *out, unsigned long flags) + { + unsigned long main_flags = flags & PAGEFLAGS_MASK; + bool append = false; + int i; + +- buf = number(buf, end, flags, default_flag_spec); +- if (buf < end) +- *buf = '('; +- buf++; ++ number(out, flags, default_flag_spec); ++ prt_char(out, '('); + + /* Page flags from the main area. 
*/ + if (main_flags) { +- buf = format_flags(buf, end, main_flags, pageflag_names); ++ format_flags(out, main_flags, pageflag_names); + append = true; + } + +@@ -2029,41 +1898,31 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) + continue; + + /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ +- if (append) { +- if (buf < end) +- *buf = '|'; +- buf++; +- } ++ if (append) ++ prt_char(out, '|'); + +- buf = string(buf, end, pff[i].name, default_str_spec); +- if (buf < end) +- *buf = '='; +- buf++; +- buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, +- *pff[i].spec); ++ string(out, pff[i].name); ++ prt_char(out, '='); ++ number(out, (flags >> pff[i].shift) & pff[i].mask, *pff[i].spec); + + append = true; + } +- if (buf < end) +- *buf = ')'; +- buf++; +- +- return buf; ++ prt_char(out, ')'); + } + + static noinline_for_stack +-char *flags_string(char *buf, char *end, void *flags_ptr, +- struct printf_spec spec, const char *fmt) ++void flags_string(struct printbuf *out, void *flags_ptr, ++ const char *fmt) + { + unsigned long flags; + const struct trace_print_flags *names; + +- if (check_pointer(&buf, end, flags_ptr, spec)) +- return buf; ++ if (check_pointer(out, flags_ptr)) ++ return; + + switch (fmt[1]) { + case 'p': +- return format_page_flags(buf, end, *(unsigned long *)flags_ptr); ++ return format_page_flags(out, *(unsigned long *)flags_ptr); + case 'v': + flags = *(unsigned long *)flags_ptr; + names = vmaflag_names; +@@ -2073,15 +1932,15 @@ char *flags_string(char *buf, char *end, void *flags_ptr, + names = gfpflag_names; + break; + default: +- return error_string(buf, end, "(%pG?)", spec); ++ return error_string(out, "(%pG?)"); + } + +- return format_flags(buf, end, flags, names); ++ return format_flags(out, flags, names); + } + + static noinline_for_stack +-char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, +- char *end) ++void fwnode_full_name_string(struct printbuf *out, ++ struct fwnode_handle *fwnode) + { + int depth; + +@@ -2090,39 +1949,30 @@ char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, + struct fwnode_handle *__fwnode = + fwnode_get_nth_parent(fwnode, depth); + +- buf = string(buf, end, fwnode_get_name_prefix(__fwnode), +- default_str_spec); +- buf = string(buf, end, fwnode_get_name(__fwnode), +- default_str_spec); ++ string(out, fwnode_get_name_prefix(__fwnode)); ++ string(out, fwnode_get_name(__fwnode)); + + fwnode_handle_put(__fwnode); + } +- +- return buf; + } + + static noinline_for_stack +-char *device_node_string(char *buf, char *end, struct device_node *dn, +- struct printf_spec spec, const char *fmt) ++void device_node_string(struct printbuf *out, struct device_node *dn, ++ const char *fmt) + { +- char tbuf[sizeof("xxxx") + 1]; + const char *p; + int ret; +- char *buf_start = buf; + struct property *prop; + bool has_mult, pass; + +- struct printf_spec str_spec = spec; +- str_spec.field_width = -1; +- + if (fmt[0] != 'F') +- return error_string(buf, end, "(%pO?)", spec); ++ return error_string(out, "(%pO?)"); + + if (!IS_ENABLED(CONFIG_OF)) +- return error_string(buf, end, "(%pOF?)", spec); ++ return error_string(out, "(%pOF?)"); + +- if (check_pointer(&buf, end, dn, spec)) +- return buf; ++ if (check_pointer(out, dn)) ++ return; + + /* simple case without anything any more format specifiers */ + fmt++; +@@ -2130,55 +1980,48 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, + fmt = "f"; + + for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = 
true) { +- int precision; +- if (pass) { +- if (buf < end) +- *buf = ':'; +- buf++; +- } ++ if (pass) ++ prt_char(out, ':'); + + switch (*fmt) { + case 'f': /* full_name */ +- buf = fwnode_full_name_string(of_fwnode_handle(dn), buf, +- end); ++ fwnode_full_name_string(out, of_fwnode_handle(dn)); + break; +- case 'n': /* name */ +- p = fwnode_get_name(of_fwnode_handle(dn)); +- precision = str_spec.precision; +- str_spec.precision = strchrnul(p, '@') - p; +- buf = string(buf, end, p, str_spec); +- str_spec.precision = precision; ++ case 'n': { /* name */ ++ const char *name = fwnode_get_name(of_fwnode_handle(dn)); ++ unsigned len = strchrnul(name, '@') - name; ++ ++ prt_bytes(out, name, len); + break; ++ } + case 'p': /* phandle */ +- buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec); ++ prt_u64(out, dn->phandle); + break; + case 'P': /* path-spec */ + p = fwnode_get_name(of_fwnode_handle(dn)); + if (!p[1]) + p = "/"; +- buf = string(buf, end, p, str_spec); ++ string(out, p); + break; + case 'F': /* flags */ +- tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'; +- tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'; +- tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'; +- tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'; +- tbuf[4] = 0; +- buf = string_nocheck(buf, end, tbuf, str_spec); +- break; +- case 'c': /* major compatible string */ ++ prt_char(out, of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'); ++ prt_char(out, of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'); ++ prt_char(out, of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'); ++ prt_char(out, of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'); ++ break; ++ case 'c': /* major compatible string_spec */ + ret = of_property_read_string(dn, "compatible", &p); + if (!ret) +- buf = string(buf, end, p, str_spec); ++ string(out, p); + break; +- case 'C': /* full compatible string */ ++ case 'C': /* full compatible string_spec */ + has_mult = false; + of_property_for_each_string(dn, "compatible", prop, p) { + if (has_mult) +- buf = string_nocheck(buf, end, ",", str_spec); +- buf = string_nocheck(buf, end, "\"", str_spec); +- buf = string(buf, end, p, str_spec); +- buf = string_nocheck(buf, end, "\"", str_spec); ++ prt_char(out, ','); ++ prt_char(out, '\"'); ++ string(out, p); ++ prt_char(out, '\"'); + + has_mult = true; + } +@@ -2187,38 +2030,30 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, + break; + } + } +- +- return widen_string(buf, buf - buf_start, end, spec); + } + + static noinline_for_stack +-char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, +- struct printf_spec spec, const char *fmt) ++void fwnode_string(struct printbuf *out, ++ struct fwnode_handle *fwnode, ++ const char *fmt) + { +- struct printf_spec str_spec = spec; +- char *buf_start = buf; +- +- str_spec.field_width = -1; +- + if (*fmt != 'w') +- return error_string(buf, end, "(%pf?)", spec); ++ return error_string(out, "(%pf?)"); + +- if (check_pointer(&buf, end, fwnode, spec)) +- return buf; ++ if (check_pointer(out, fwnode)) ++ return; + + fmt++; + + switch (*fmt) { + case 'P': /* name */ +- buf = string(buf, end, fwnode_get_name(fwnode), str_spec); ++ string(out, fwnode_get_name(fwnode)); + break; + case 'f': /* full_name */ + default: +- buf = fwnode_full_name_string(fwnode, buf, end); ++ fwnode_full_name_string(out, fwnode); + break; + } +- +- return widen_string(buf, buf - buf_start, end, spec); + } + + int __init no_hash_pointers_enable(char *str) +@@ -2374,33 
+2209,40 @@ early_param("no_hash_pointers", no_hash_pointers_enable); + * rendering it useful as a unique identifier. + */ + static noinline_for_stack +-char *pointer(const char *fmt, char *buf, char *end, void *ptr, +- struct printf_spec spec) ++void pointer(struct printbuf *out, const char *fmt, ++ void *ptr, struct printf_spec spec) + { ++ unsigned prev_pos = out->pos; ++ + switch (*fmt) { + case 'S': + case 's': + ptr = dereference_symbol_descriptor(ptr); + fallthrough; + case 'B': +- return symbol_string(buf, end, ptr, spec, fmt); ++ symbol_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'R': + case 'r': +- return resource_string(buf, end, ptr, spec, fmt); ++ resource_string(out, ptr, fmt[0] == 'R'); ++ return do_width_precision(out, prev_pos, spec); + case 'h': +- return hex_string(buf, end, ptr, spec, fmt); ++ /* Uses field_width but _not_ as field size */ ++ return hex_string(out, ptr, spec.field_width, fmt); + case 'b': ++ /* Uses field_width but _not_ as field size */ + switch (fmt[1]) { + case 'l': +- return bitmap_list_string(buf, end, ptr, spec, fmt); ++ return bitmap_list_string(out, ptr, spec.field_width); + default: +- return bitmap_string(buf, end, ptr, spec, fmt); ++ return bitmap_string(out, ptr, spec.field_width); + } + case 'M': /* Colon separated: 00:01:02:03:04:05 */ + case 'm': /* Contiguous: 000102030405 */ + /* [mM]F (FDDI) */ + /* [mM]R (Reverse order; Bluetooth) */ +- return mac_address_string(buf, end, ptr, spec, fmt); ++ mac_address_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'I': /* Formatted IP supported + * 4: 1.2.3.4 + * 6: 0001:0203:...:0708 +@@ -2410,57 +2252,69 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, + * 4: 001.002.003.004 + * 6: 000102...0f + */ +- return ip_addr_string(buf, end, ptr, spec, fmt); ++ ip_addr_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'E': +- return escaped_string(buf, end, ptr, spec, fmt); ++ return escaped_string(out, ptr, spec, fmt); + case 'U': +- return uuid_string(buf, end, ptr, spec, fmt); ++ uuid_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'V': +- return va_format(buf, end, ptr, spec, fmt); ++ return va_format(out, ptr, spec, fmt); + case 'K': +- return restricted_pointer(buf, end, ptr, spec); ++ return restricted_pointer(out, ptr, spec); + case 'N': +- return netdev_bits(buf, end, ptr, spec, fmt); ++ netdev_bits(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case '4': +- return fourcc_string(buf, end, ptr, spec, fmt); ++ fourcc_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'a': +- return address_val(buf, end, ptr, spec, fmt); ++ address_val(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'd': +- return dentry_name(buf, end, ptr, spec, fmt); ++ dentry_name(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 't': +- return time_and_date(buf, end, ptr, spec, fmt); ++ time_and_date(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'C': +- return clock(buf, end, ptr, spec, fmt); ++ return clock(out, ptr, spec, fmt); + case 'D': +- return file_dentry_name(buf, end, ptr, spec, fmt); ++ file_dentry_name(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + #ifdef CONFIG_BLOCK + case 'g': +- return bdev_name(buf, end, ptr, spec, fmt); ++ bdev_name(out, ptr); ++ return do_width_precision(out, prev_pos, spec); + #endif + + case 
'G': +- return flags_string(buf, end, ptr, spec, fmt); ++ flags_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'O': +- return device_node_string(buf, end, ptr, spec, fmt + 1); ++ device_node_string(out, ptr, fmt + 1); ++ return do_width_precision(out, prev_pos, spec); + case 'f': +- return fwnode_string(buf, end, ptr, spec, fmt + 1); ++ fwnode_string(out, ptr, fmt + 1); ++ return do_width_precision(out, prev_pos, spec); + case 'x': +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + case 'e': + /* %pe with a non-ERR_PTR gets treated as plain %p */ + if (!IS_ERR(ptr)) +- return default_pointer(buf, end, ptr, spec); +- return err_ptr(buf, end, ptr, spec); ++ return default_pointer(out, ptr, spec); ++ return err_ptr(out, ptr, spec); + case 'u': + case 'k': + switch (fmt[1]) { + case 's': +- return string(buf, end, ptr, spec); ++ return string_spec(out, ptr, spec); + default: +- return error_string(buf, end, "(einval)", spec); ++ return error_string_spec(out, "(einval)", spec); + } + default: +- return default_pointer(buf, end, ptr, spec); ++ return default_pointer(out, ptr, spec); + } + } + +@@ -2599,8 +2453,14 @@ int format_decode(const char *fmt, struct printf_spec *spec) + return ++fmt - start; + + case 'p': +- spec->type = FORMAT_TYPE_PTR; +- return ++fmt - start; ++ fmt++; ++ if (fmt[0] == 'f' && ++ fmt[1] == '(') { ++ fmt += 2; ++ spec->type = FORMAT_TYPE_FN; ++ } else ++ spec->type = FORMAT_TYPE_PTR; ++ return fmt - start; + + case '%': + spec->type = FORMAT_TYPE_PERCENT_CHAR; +@@ -2681,53 +2541,89 @@ set_precision(struct printf_spec *spec, int prec) + } + } + ++static void call_prt_fn(struct printbuf *out, struct call_pp *call_pp, void **fn_args, unsigned nr_args) ++{ ++ typedef void (*printf_fn_0)(struct printbuf *); ++ typedef void (*printf_fn_1)(struct printbuf *, void *); ++ typedef void (*printf_fn_2)(struct printbuf *, void *, void *); ++ typedef void (*printf_fn_3)(struct printbuf *, void *, void *, void *); ++ typedef void (*printf_fn_4)(struct printbuf *, void *, void *, void *, void *); ++ typedef void (*printf_fn_5)(struct printbuf *, void *, void *, void *, void *, void *); ++ typedef void (*printf_fn_6)(struct printbuf *, void *, void *, void *, void *, void *, void *); ++ typedef void (*printf_fn_7)(struct printbuf *, void *, void *, void *, void *, void *, void *, void *); ++ typedef void (*printf_fn_8)(struct printbuf *, void *, void *, void *, void *, void *, void *, void *, void *); ++ void *fn; ++ unsigned i; ++ ++ if (check_pointer(out, call_pp)) ++ return; ++ ++ if (call_pp->magic != CALL_PP_MAGIC) { ++ error_string(out, "bad pretty-printer magic"); ++ return; ++ } ++ ++ fn = call_pp->fn; ++ if (check_pointer(out, fn)) ++ return; ++ ++ for (i = 0; i < nr_args; i++) ++ if (check_pointer(out, fn_args[i])) ++ return; ++ ++ switch (nr_args) { ++ case 0: ++ ((printf_fn_0)fn)(out); ++ break; ++ case 1: ++ ((printf_fn_1)fn)(out, fn_args[0]); ++ break; ++ case 2: ++ ((printf_fn_2)fn)(out, fn_args[0], fn_args[1]); ++ break; ++ case 3: ++ ((printf_fn_3)fn)(out, fn_args[0], fn_args[1], fn_args[2]); ++ break; ++ case 4: ++ ((printf_fn_4)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3]); ++ break; ++ case 5: ++ ((printf_fn_5)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3], fn_args[4]); ++ break; ++ case 6: ++ ((printf_fn_6)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3], fn_args[4], fn_args[5]); ++ break; ++ case 7: ++ ((printf_fn_7)fn)(out, fn_args[0], fn_args[1], fn_args[2], 
fn_args[3], fn_args[4], fn_args[5], fn_args[6]); ++ break; ++ case 8: ++ ((printf_fn_8)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3], fn_args[4], fn_args[5], fn_args[6], fn_args[7]); ++ break; ++ } ++} ++ + /** +- * vsnprintf - Format a string and place it in a buffer +- * @buf: The buffer to place the result into +- * @size: The size of the buffer, including the trailing null space ++ * prt_vprintf - Format a string, outputting to a printbuf ++ * @out: The printbuf to output to + * @fmt: The format string to use + * @args: Arguments for the format string + * +- * This function generally follows C99 vsnprintf, but has some +- * extensions and a few limitations: ++ * prt_vprintf works much like the traditional vsnprintf(), but outputs to a ++ * printbuf instead of raw pointer/size. + * +- * - ``%n`` is unsupported +- * - ``%p*`` is handled by pointer() +- * +- * See pointer() or Documentation/core-api/printk-formats.rst for more +- * extensive description. ++ * If you're not already dealing with a va_list consider using prt_printf(). + * +- * **Please update the documentation in both places when making changes** +- * +- * The return value is the number of characters which would +- * be generated for the given input, excluding the trailing +- * '\0', as per ISO C99. If you want to have the exact +- * number of characters written into @buf as return value +- * (not including the trailing '\0'), use vscnprintf(). If the +- * return is greater than or equal to @size, the resulting +- * string is truncated. +- * +- * If you're not already dealing with a va_list consider using snprintf(). ++ * See the vsnprintf() documentation for format string extensions over C99. + */ +-int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) ++void prt_vprintf(struct printbuf *out, const char *fmt, va_list args) + { + unsigned long long num; +- char *str, *end; + struct printf_spec spec = {0}; + + /* Reject out-of-range values early. Large positive sizes are + used for unknown buffer sizes. 
*/ +- if (WARN_ON_ONCE(size > INT_MAX)) +- return 0; +- +- str = buf; +- end = buf + size; +- +- /* Make sure end is always >= buf */ +- if (end < buf) { +- end = ((void *)-1); +- size = end - buf; +- } ++ if (WARN_ON_ONCE(out->size > INT_MAX)) ++ return; + + while (*fmt) { + const char *old_fmt = fmt; +@@ -2736,16 +2632,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + fmt += read; + + switch (spec.type) { +- case FORMAT_TYPE_NONE: { +- int copy = read; +- if (str < end) { +- if (copy > end - str) +- copy = end - str; +- memcpy(str, old_fmt, copy); +- } +- str += read; ++ case FORMAT_TYPE_NONE: ++ prt_bytes(out, old_fmt, read); + break; +- } + + case FORMAT_TYPE_WIDTH: + set_field_width(&spec, va_arg(args, int)); +@@ -2755,44 +2644,60 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + set_precision(&spec, va_arg(args, int)); + break; + +- case FORMAT_TYPE_CHAR: { +- char c; ++ case FORMAT_TYPE_CHAR: ++ if (spec.field_width > 0 && !(spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); + +- if (!(spec.flags & LEFT)) { +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; ++ __prt_char(out, (unsigned char) va_arg(args, int)); + +- } +- } +- c = (unsigned char) va_arg(args, int); +- if (str < end) +- *str = c; +- ++str; +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } ++ if (spec.field_width > 0 && (spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); ++ spec.field_width = 0; + break; +- } + + case FORMAT_TYPE_STR: +- str = string(str, end, va_arg(args, char *), spec); ++ /* ++ * we can't use string() then do_width_precision ++ * afterwards: people use the field width for passing ++ * non nul terminated strings ++ */ ++ string_spec(out, va_arg(args, char *), spec); + break; + + case FORMAT_TYPE_PTR: +- str = pointer(fmt, str, end, va_arg(args, void *), +- spec); ++ pointer(out, fmt, va_arg(args, void *), spec); + while (isalnum(*fmt)) + fmt++; + break; + ++ case FORMAT_TYPE_FN: { ++ unsigned nr_args = 0; ++ void *fn_args[8]; ++ void *fn = va_arg(args, void *); ++ ++ while (*fmt != ')') { ++ if (nr_args) { ++ if (fmt[0] != ',') ++ goto out; ++ fmt++; ++ } ++ ++ if (fmt[0] != '%' || fmt[1] != 'p') ++ goto out; ++ fmt += 2; ++ ++ if (WARN_ON_ONCE(nr_args == ARRAY_SIZE(fn_args))) ++ goto out; ++ fn_args[nr_args++] = va_arg(args, void *); ++ } ++ ++ call_prt_fn(out, fn, fn_args, nr_args); ++ fmt++; /* past trailing ) */ ++ break; ++ } ++ + case FORMAT_TYPE_PERCENT_CHAR: +- if (str < end) +- *str = '%'; +- ++str; ++ __prt_char(out, '%'); + break; + + case FORMAT_TYPE_INVALID: +@@ -2845,21 +2750,70 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + num = va_arg(args, unsigned int); + } + +- str = number(str, end, num, spec); ++ number(out, num, spec); + } + } +- + out: +- if (size > 0) { +- if (str < end) +- *str = '\0'; +- else +- end[-1] = '\0'; +- } ++ printbuf_nul_terminate(out); ++} ++EXPORT_SYMBOL(prt_vprintf); + +- /* the trailing null byte doesn't count towards the total */ +- return str-buf; ++/** ++ * prt_printf - Format a string, outputting to a printbuf ++ * @out: The printbuf to output to ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * ++ * prt_printf works much like the traditional sprintf(), but outputs to a ++ * printbuf instead of raw pointer/size. ++ * ++ * See the vsnprintf() documentation for format string extensions over C99. 
++ */ ++void prt_printf(struct printbuf *out, const char *fmt, ...) ++{ ++ va_list args; ++ ++ va_start(args, fmt); ++ prt_vprintf(out, fmt, args); ++ va_end(args); ++} ++EXPORT_SYMBOL(prt_printf); ++ ++/** ++ * vsnprintf - Format a string and place it in a buffer ++ * @buf: The buffer to place the result into ++ * @size: The size of the buffer, including the trailing null space ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * This function generally follows C99 vsnprintf, but has some ++ * extensions and a few limitations: ++ * ++ * - ``%n`` is unsupported ++ * - ``%p*`` is handled by pointer() ++ * ++ * See pointer() or Documentation/core-api/printk-formats.rst for more ++ * extensive description. ++ * ++ * **Please update the documentation in both places when making changes** ++ * ++ * The return value is the number of characters which would ++ * be generated for the given input, excluding the trailing ++ * '\0', as per ISO C99. If you want to have the exact ++ * number of characters written into @buf as return value ++ * (not including the trailing '\0'), use vscnprintf(). If the ++ * return is greater than or equal to @size, the resulting ++ * string is truncated. ++ * ++ * If you're not already dealing with a va_list consider using snprintf(). ++ */ ++int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(buf, size); + ++ prt_vprintf(&out, fmt, args); ++ return out.pos; + } + EXPORT_SYMBOL(vsnprintf); + +@@ -2997,53 +2951,46 @@ EXPORT_SYMBOL(sprintf); + * bstr_printf() - Binary data to text string + */ + ++static inline void printbuf_align(struct printbuf *out, unsigned align) ++{ ++ /* Assumes output buffer is correctly aligned: */ ++ out->pos += align - 1; ++ out->pos &= ~(align - 1); ++} ++ + /** +- * vbin_printf - Parse a format string and place args' binary value in a buffer +- * @bin_buf: The buffer to place args' binary value +- * @size: The size of the buffer(by words(32bits), not characters) ++ * prt_vbinprintf - Parse a format string and place args' binary value in a buffer ++ * @out: The buffer to place args' binary value + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The format follows C99 vsnprintf, except %n is ignored, and its argument + * is skipped. + * +- * The return value is the number of words(32bits) which would be generated for +- * the given input. +- * + * NOTE: + * If the return value is greater than @size, the resulting bin_buf is NOT + * valid for bstr_printf(). 
+ */ +-int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) ++void prt_vbinprintf(struct printbuf *out, const char *fmt, va_list args) + { + struct printf_spec spec = {0}; +- char *str, *end; + int width; + +- str = (char *)bin_buf; +- end = (char *)(bin_buf + size); +- + #define save_arg(type) \ + ({ \ + unsigned long long value; \ + if (sizeof(type) == 8) { \ +- unsigned long long val8; \ +- str = PTR_ALIGN(str, sizeof(u32)); \ +- val8 = va_arg(args, unsigned long long); \ +- if (str + sizeof(type) <= end) { \ +- *(u32 *)str = *(u32 *)&val8; \ +- *(u32 *)(str + 4) = *((u32 *)&val8 + 1); \ +- } \ ++ u64 val8 = va_arg(args, u64); \ ++ printbuf_align(out, sizeof(u32)); \ ++ prt_bytes(out, (u32 *) &val8, 4); \ ++ prt_bytes(out, ((u32 *) &val8) + 1, 4); \ + value = val8; \ + } else { \ +- unsigned int val4; \ +- str = PTR_ALIGN(str, sizeof(type)); \ +- val4 = va_arg(args, int); \ +- if (str + sizeof(type) <= end) \ +- *(typeof(type) *)str = (type)(long)val4; \ ++ u32 val4 = va_arg(args, u32); \ ++ printbuf_align(out, sizeof(type)); \ ++ prt_bytes(out, &val4, sizeof(type)); \ + value = (unsigned long long)val4; \ + } \ +- str += sizeof(type); \ + value; \ + }) + +@@ -3074,16 +3021,12 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) + case FORMAT_TYPE_STR: { + const char *save_str = va_arg(args, char *); + const char *err_msg; +- size_t len; + + err_msg = check_pointer_msg(save_str); + if (err_msg) + save_str = err_msg; + +- len = strlen(save_str) + 1; +- if (str + len < end) +- memcpy(str, save_str, len); +- str += len; ++ prt_str(out, save_str); + break; + } + +@@ -3103,12 +3046,7 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) + save_arg(void *); + break; + } +- str = pointer(fmt, str, end, va_arg(args, void *), +- spec); +- if (str + 1 < end) +- *str++ = '\0'; +- else +- end[-1] = '\0'; /* Must be nul terminated */ ++ pointer(out, fmt, va_arg(args, void *), spec); + } + /* skip all alphanumeric pointer suffixes */ + while (isalnum(*fmt)) +@@ -3146,15 +3084,15 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) + } + + out: +- return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; ++ printbuf_nul_terminate(out); ++ printbuf_align(out, 4); + #undef save_arg + } +-EXPORT_SYMBOL_GPL(vbin_printf); ++EXPORT_SYMBOL_GPL(prt_vbinprintf); + + /** +- * bstr_printf - Format a string from binary arguments and place it in a buffer ++ * prt_bstrprintf - Format a string from binary arguments and place it in a buffer + * @buf: The buffer to place the result into +- * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @bin_buf: Binary arguments for the format string + * +@@ -3164,26 +3102,14 @@ EXPORT_SYMBOL_GPL(vbin_printf); + * + * The format follows C99 vsnprintf, but has some extensions: + * see vsnprintf comment for details. +- * +- * The return value is the number of characters which would +- * be generated for the given input, excluding the trailing +- * '\0', as per ISO C99. If you want to have the exact +- * number of characters written into @buf as return value +- * (not including the trailing '\0'), use vscnprintf(). If the +- * return is greater than or equal to @size, the resulting +- * string is truncated. 
+ */ +-int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) ++void prt_bstrprintf(struct printbuf *out, const char *fmt, const u32 *bin_buf) + { + struct printf_spec spec = {0}; +- char *str, *end; + const char *args = (const char *)bin_buf; + +- if (WARN_ON_ONCE(size > INT_MAX)) +- return 0; +- +- str = buf; +- end = buf + size; ++ if (WARN_ON_ONCE(out->size > INT_MAX)) ++ return; + + #define get_arg(type) \ + ({ \ +@@ -3200,12 +3126,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + value; \ + }) + +- /* Make sure end is always >= buf */ +- if (end < buf) { +- end = ((void *)-1); +- size = end - buf; +- } +- + while (*fmt) { + const char *old_fmt = fmt; + int read = format_decode(fmt, &spec); +@@ -3213,16 +3133,9 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + fmt += read; + + switch (spec.type) { +- case FORMAT_TYPE_NONE: { +- int copy = read; +- if (str < end) { +- if (copy > end - str) +- copy = end - str; +- memcpy(str, old_fmt, copy); +- } +- str += read; ++ case FORMAT_TYPE_NONE: ++ prt_bytes(out, old_fmt, read); + break; +- } + + case FORMAT_TYPE_WIDTH: + set_field_width(&spec, get_arg(int)); +@@ -3232,38 +3145,24 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + set_precision(&spec, get_arg(int)); + break; + +- case FORMAT_TYPE_CHAR: { +- char c; +- +- if (!(spec.flags & LEFT)) { +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } +- } +- c = (unsigned char) get_arg(char); +- if (str < end) +- *str = c; +- ++str; +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } ++ case FORMAT_TYPE_CHAR: ++ if (!(spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); ++ __prt_char(out, (unsigned char) get_arg(char)); ++ if ((spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); + break; +- } + + case FORMAT_TYPE_STR: { + const char *str_arg = args; + args += strlen(str_arg) + 1; +- str = string(str, end, (char *)str_arg, spec); ++ string_spec(out, (char *)str_arg, spec); + break; + } + + case FORMAT_TYPE_PTR: { + bool process = false; +- int copy, len; ++ int len; + /* Non function dereferences were already done */ + switch (*fmt) { + case 'S': +@@ -3279,17 +3178,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + break; + } + /* Pointer dereference was already processed */ +- if (str < end) { +- len = copy = strlen(args); +- if (copy > end - str) +- copy = end - str; +- memcpy(str, args, copy); +- str += len; +- args += len + 1; +- } ++ len = strlen(args); ++ prt_bytes(out, args, len); ++ args += len + 1; + } + if (process) +- str = pointer(fmt, str, end, get_arg(void *), spec); ++ pointer(out, fmt, get_arg(void *), spec); + + while (isalnum(*fmt)) + fmt++; +@@ -3297,9 +3191,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + } + + case FORMAT_TYPE_PERCENT_CHAR: +- if (str < end) +- *str = '%'; +- ++str; ++ __prt_char(out, '%'); + break; + + case FORMAT_TYPE_INVALID: +@@ -3342,23 +3234,87 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + num = get_arg(int); + } + +- str = number(str, end, num, spec); ++ number(out, num, spec); + } /* default: */ + } /* switch(spec.type) */ + } /* while(*fmt) */ + + out: +- if (size > 0) { +- if (str < end) +- *str = '\0'; +- else +- end[-1] = '\0'; +- } +- + #undef get_arg ++ printbuf_nul_terminate(out); ++} ++EXPORT_SYMBOL_GPL(prt_bstrprintf); ++ ++/** ++ * 
prt_bprintf - Parse a format string and place args' binary value in a buffer ++ * @out: The buffer to place args' binary value ++ * @fmt: The format string to use ++ * @...: Arguments for the format string ++ */ ++void prt_bprintf(struct printbuf *out, const char *fmt, ...) ++{ ++ va_list args; ++ ++ va_start(args, fmt); ++ prt_vbinprintf(out, fmt, args); ++ va_end(args); ++} ++EXPORT_SYMBOL_GPL(prt_bprintf); ++ ++/** ++ * vbin_printf - Parse a format string and place args' binary value in a buffer ++ * @bin_buf: The buffer to place args' binary value ++ * @size: The size of the buffer(by words(32bits), not characters) ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * The format follows C99 vsnprintf, except %n is ignored, and its argument ++ * is skipped. ++ * ++ * The return value is the number of words(32bits) which would be generated for ++ * the given input. ++ * ++ * NOTE: ++ * If the return value is greater than @size, the resulting bin_buf is NOT ++ * valid for bstr_printf(). ++ */ ++int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) ++{ ++ struct printbuf out = PRINTBUF_EXTERN((char *) bin_buf, size); ++ ++ prt_vbinprintf(&out, fmt, args); ++ return out.pos; ++} ++EXPORT_SYMBOL_GPL(vbin_printf); ++ ++/** ++ * bstr_printf - Format a string from binary arguments and place it in a buffer ++ * @buf: The buffer to place the result into ++ * @size: The size of the buffer, including the trailing null space ++ * @fmt: The format string to use ++ * @bin_buf: Binary arguments for the format string ++ * ++ * This function like C99 vsnprintf, but the difference is that vsnprintf gets ++ * arguments from stack, and bstr_printf gets arguments from @bin_buf which is ++ * a binary buffer that generated by vbin_printf. ++ * ++ * The format follows C99 vsnprintf, but has some extensions: ++ * see vsnprintf comment for details. ++ * ++ * The return value is the number of characters which would ++ * be generated for the given input, excluding the trailing ++ * '\0', as per ISO C99. If you want to have the exact ++ * number of characters written into @buf as return value ++ * (not including the trailing '\0'), use vscnprintf(). If the ++ * return is greater than or equal to @size, the resulting ++ * string is truncated. 
++ */ ++int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(buf, size); + +- /* the trailing null byte doesn't count towards the total */ +- return str - buf; ++ prt_bstrprintf(&out, fmt, bin_buf); ++ return out.pos; + } + EXPORT_SYMBOL_GPL(bstr_printf); + +diff --git a/mm/Makefile b/mm/Makefile +index 6f9ffa968a1a..9731f495bbce 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -54,7 +54,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ + mm_init.o percpu.o slab_common.o \ + compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ +- debug.o gup.o mmap_lock.o $(mmu-y) ++ debug.o gup.o mmap_lock.o show_mem.o $(mmu-y) + + # Give 'page_alloc' its own module-parameter namespace + page-alloc-y := page_alloc.o +diff --git a/mm/filemap.c b/mm/filemap.c +index ffdfbc8b0e3c..8b9e18f79f2b 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2223,6 +2223,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + + return ret; + } ++EXPORT_SYMBOL(find_get_pages_range); + + /** + * find_get_pages_contig - gang contiguous pagecache lookup +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 618c366a2f07..660ddd48267d 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -62,7 +62,7 @@ + #include + #include + #include +-#include ++#include + #include "internal.h" + #include + #include +@@ -1462,13 +1462,9 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, + + static char *memory_stat_format(struct mem_cgroup *memcg) + { +- struct seq_buf s; ++ struct printbuf buf = PRINTBUF; + int i; + +- seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); +- if (!s.buffer) +- return NULL; +- + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. 
+@@ -1485,37 +1481,37 @@ static char *memory_stat_format(struct mem_cgroup *memcg) + u64 size; + + size = memcg_page_state_output(memcg, memory_stats[i].idx); +- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); ++ prt_printf(&buf, "%s %llu\n", memory_stats[i].name, size); + + if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { + size += memcg_page_state_output(memcg, + NR_SLAB_RECLAIMABLE_B); +- seq_buf_printf(&s, "slab %llu\n", size); ++ prt_printf(&buf, "slab %llu\n", size); + } + } + + /* Accumulated memory events */ + +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), +- memcg_events(memcg, PGFAULT)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), +- memcg_events(memcg, PGMAJFAULT)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), +- memcg_events(memcg, PGREFILL)); +- seq_buf_printf(&s, "pgscan %lu\n", +- memcg_events(memcg, PGSCAN_KSWAPD) + +- memcg_events(memcg, PGSCAN_DIRECT)); +- seq_buf_printf(&s, "pgsteal %lu\n", +- memcg_events(memcg, PGSTEAL_KSWAPD) + +- memcg_events(memcg, PGSTEAL_DIRECT)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), +- memcg_events(memcg, PGACTIVATE)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), +- memcg_events(memcg, PGDEACTIVATE)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), +- memcg_events(memcg, PGLAZYFREE)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), +- memcg_events(memcg, PGLAZYFREED)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGFAULT), ++ memcg_events(memcg, PGFAULT)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGMAJFAULT), ++ memcg_events(memcg, PGMAJFAULT)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGREFILL), ++ memcg_events(memcg, PGREFILL)); ++ prt_printf(&buf, "pgscan %lu\n", ++ memcg_events(memcg, PGSCAN_KSWAPD) + ++ memcg_events(memcg, PGSCAN_DIRECT)); ++ prt_printf(&buf, "pgsteal %lu\n", ++ memcg_events(memcg, PGSTEAL_KSWAPD) + ++ memcg_events(memcg, PGSTEAL_DIRECT)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGACTIVATE), ++ memcg_events(memcg, PGACTIVATE)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGDEACTIVATE), ++ memcg_events(memcg, PGDEACTIVATE)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGLAZYFREE), ++ memcg_events(memcg, PGLAZYFREE)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGLAZYFREED), ++ memcg_events(memcg, PGLAZYFREED)); + + #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN), +@@ -1525,16 +1521,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg) + #endif + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), +- memcg_events(memcg, THP_FAULT_ALLOC)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), +- memcg_events(memcg, THP_COLLAPSE_ALLOC)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), ++ memcg_events(memcg, THP_FAULT_ALLOC)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), ++ memcg_events(memcg, THP_COLLAPSE_ALLOC)); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +- /* The above should easily fit into one page */ +- WARN_ON_ONCE(seq_buf_has_overflowed(&s)); ++ if (buf.allocation_failure) { ++ printbuf_exit(&buf); ++ return NULL; ++ } + +- return s.buffer; ++ return buf.buf; + } + + #define K(x) ((x) << (PAGE_SHIFT-10)) +diff --git a/mm/nommu.c b/mm/nommu.c +index 9d7afc2d959e..dd53020262d8 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -281,6 +281,24 @@ void *vzalloc_node(unsigned long size, int node) 
+ } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. ++ */ ++ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc(size, gfp_mask); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + /** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index 3c6cf9e3cd66..e4dca11dc54a 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) + return false; + } + +-/* +- * Check whether unreclaimable slab amount is greater than +- * all user memory(LRU pages). +- * dump_unreclaimable_slab() could help in the case that +- * oom due to too much unreclaimable slab used by kernel. +-*/ +-static bool should_dump_unreclaim_slab(void) +-{ +- unsigned long nr_lru; +- +- nr_lru = global_node_page_state(NR_ACTIVE_ANON) + +- global_node_page_state(NR_INACTIVE_ANON) + +- global_node_page_state(NR_ACTIVE_FILE) + +- global_node_page_state(NR_INACTIVE_FILE) + +- global_node_page_state(NR_ISOLATED_ANON) + +- global_node_page_state(NR_ISOLATED_FILE) + +- global_node_page_state(NR_UNEVICTABLE); +- +- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); +-} +- + /** + * oom_badness - heuristic function to determine which candidate task to kill + * @p: task struct of which task we should calculate +@@ -462,8 +441,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) + mem_cgroup_print_oom_meminfo(oc->memcg); + else { + show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); +- if (should_dump_unreclaim_slab()) +- dump_unreclaimable_slab(); + } + if (sysctl_oom_dump_tasks) + dump_tasks(oc); +diff --git a/lib/show_mem.c b/mm/show_mem.c +similarity index 83% +rename from lib/show_mem.c +rename to mm/show_mem.c +index 1c26c14ffbb9..47225158ce49 100644 +--- a/lib/show_mem.c ++++ b/mm/show_mem.c +@@ -7,6 +7,9 @@ + + #include + #include ++#include ++ ++#include "slab.h" + + void show_mem(unsigned int filter, nodemask_t *nodemask) + { +@@ -41,4 +44,9 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) + #ifdef CONFIG_MEMORY_FAILURE + printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); + #endif ++ printk("Unreclaimable slab info:\n"); ++ printk("%pf()", CALL_PP(dump_unreclaimable_slab)); ++ ++ printk("Shrinkers:\n"); ++ printk("%pf()", CALL_PP(shrinkers_to_text)); + } +diff --git a/mm/slab.h b/mm/slab.h +index db9fb5c8dae7..502616394f7f 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -806,10 +806,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) + + #endif + ++struct printbuf; ++ + #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +-void dump_unreclaimable_slab(void); ++void dump_unreclaimable_slab(struct printbuf *); + #else +-static inline void dump_unreclaimable_slab(void) ++static inline void dump_unreclaimable_slab(struct printbuf *out) + { + } + #endif +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 77c3adf40e50..3be0d468a599 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + 
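/*
 * Editorial note (not part of the patch): the memory_stat_format() hunks
 * above replace a fixed-size, PAGE_SIZE seq_buf with a heap-backed printbuf
 * that grows as needed. A minimal sketch of that calling convention,
 * assuming the printbuf API introduced earlier in this patch (PRINTBUF,
 * prt_printf, allocation_failure, printbuf_exit); format_some_stats() and
 * its counters are illustrative names only:
 */
static char *format_some_stats(void)
{
	struct printbuf buf = PRINTBUF;		/* starts empty, allocates on demand */

	prt_printf(&buf, "widgets %lu\n", 42UL);
	prt_printf(&buf, "gadgets %lu\n", 7UL);

	if (buf.allocation_failure) {		/* an allocation failed mid-format */
		printbuf_exit(&buf);
		return NULL;
	}

	return buf.buf;				/* caller owns the returned string */
}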
#include +@@ -1085,10 +1086,15 @@ static int slab_show(struct seq_file *m, void *p) + return 0; + } + +-void dump_unreclaimable_slab(void) ++void dump_unreclaimable_slab(struct printbuf *out) + { + struct kmem_cache *s; + struct slabinfo sinfo; ++ struct slab_by_mem { ++ struct kmem_cache *s; ++ size_t total, active; ++ } slabs_by_mem[10], n; ++ int i, nr = 0; + + /* + * Here acquiring slab_mutex is risky since we don't prefer to get +@@ -1098,12 +1104,11 @@ void dump_unreclaimable_slab(void) + * without acquiring the mutex. + */ + if (!mutex_trylock(&slab_mutex)) { +- pr_warn("excessive unreclaimable slab but cannot dump stats\n"); ++ prt_str(out, "excessive unreclaimable slab but cannot dump stats\n"); + return; + } + +- pr_info("Unreclaimable slab info:\n"); +- pr_info("Name Used Total\n"); ++ printbuf_atomic_inc(out); + + list_for_each_entry(s, &slab_caches, list) { + if (s->flags & SLAB_RECLAIM_ACCOUNT) +@@ -1111,11 +1116,43 @@ void dump_unreclaimable_slab(void) + + get_slabinfo(s, &sinfo); + +- if (sinfo.num_objs > 0) +- pr_info("%-17s %10luKB %10luKB\n", s->name, +- (sinfo.active_objs * s->size) / 1024, +- (sinfo.num_objs * s->size) / 1024); ++ if (!sinfo.num_objs) ++ continue; ++ ++ n.s = s; ++ n.total = sinfo.num_objs * s->size; ++ n.active = sinfo.active_objs * s->size; ++ ++ for (i = 0; i < nr; i++) ++ if (n.total < slabs_by_mem[i].total) ++ break; ++ ++ if (nr < ARRAY_SIZE(slabs_by_mem)) { ++ memmove(&slabs_by_mem[i + 1], ++ &slabs_by_mem[i], ++ sizeof(slabs_by_mem[0]) * (nr - i)); ++ nr++; ++ } else if (i) { ++ i--; ++ memmove(&slabs_by_mem[0], ++ &slabs_by_mem[1], ++ sizeof(slabs_by_mem[0]) * i); ++ } else { ++ continue; ++ } ++ ++ slabs_by_mem[i] = n; ++ } ++ ++ for (i = nr - 1; i >= 0; --i) { ++ prt_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); ++ prt_human_readable_u64(out, slabs_by_mem[i].total); ++ prt_printf(out, " active: "); ++ prt_human_readable_u64(out, slabs_by_mem[i].active); ++ prt_newline(out); + } ++ ++ printbuf_atomic_dec(out); + mutex_unlock(&slab_mutex); + } + +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index effd1ff6a4b4..ea6375c960a2 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -3361,6 +3361,27 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. 
++ * ++ * Return: pointer to the allocated memory or %NULL on error ++ */ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, ++ NUMA_NO_NODE, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) + #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) + #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +diff --git a/mm/vmscan.c b/mm/vmscan.c +index f7d9a683e3a7..0ea3ce8e258f 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -699,6 +700,89 @@ void synchronize_shrinkers(void) + } + EXPORT_SYMBOL(synchronize_shrinkers); + ++void shrinker_to_text(struct printbuf *out, struct shrinker *shrinker) ++{ ++ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; ++ ++ if (shrinker->name[0]) ++ prt_str(out, shrinker->name); ++ else ++ prt_printf(out, "%ps:", shrinker->scan_objects); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "objects: %lu", shrinker->count_objects(shrinker, &sc)); ++ prt_newline(out); ++ prt_printf(out, "requested to free: %lu", atomic_long_read(&shrinker->objects_requested_to_free)); ++ prt_newline(out); ++ prt_printf(out, "objects freed: %lu", atomic_long_read(&shrinker->objects_freed)); ++ prt_newline(out); ++ ++ if (shrinker->to_text) { ++ shrinker->to_text(out, shrinker); ++ prt_newline(out); ++ } ++ ++ printbuf_indent_sub(out, 2); ++} ++ ++/** ++ * shrinkers_to_text - Report on shrinkers with highest usage ++ * ++ * This reports on the top 10 shrinkers, by object counts, in sorted order: ++ * intended to be used for OOM reporting. 
++ */ ++void shrinkers_to_text(struct printbuf *out) ++{ ++ struct shrinker *shrinker; ++ struct shrinker_by_mem { ++ struct shrinker *shrinker; ++ unsigned long mem; ++ } shrinkers_by_mem[10]; ++ int i, nr = 0; ++ ++ if (!down_read_trylock(&shrinker_rwsem)) { ++ prt_str(out, "(couldn't take shrinker lock)"); ++ return; ++ } ++ ++ list_for_each_entry(shrinker, &shrinker_list, list) { ++ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; ++ unsigned long mem = shrinker->count_objects(shrinker, &sc); ++ ++ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) ++ continue; ++ ++ for (i = 0; i < nr; i++) ++ if (mem < shrinkers_by_mem[i].mem) ++ break; ++ ++ if (nr < ARRAY_SIZE(shrinkers_by_mem)) { ++ memmove(&shrinkers_by_mem[i + 1], ++ &shrinkers_by_mem[i], ++ sizeof(shrinkers_by_mem[0]) * (nr - i)); ++ nr++; ++ } else if (i) { ++ i--; ++ memmove(&shrinkers_by_mem[0], ++ &shrinkers_by_mem[1], ++ sizeof(shrinkers_by_mem[0]) * i); ++ } else { ++ continue; ++ } ++ ++ shrinkers_by_mem[i] = (struct shrinker_by_mem) { ++ .shrinker = shrinker, ++ .mem = mem, ++ }; ++ } ++ ++ for (i = nr - 1; i >= 0; --i) ++ shrinker_to_text(out, shrinkers_by_mem[i].shrinker); ++ ++ up_read(&shrinker_rwsem); ++} ++ + #define SHRINK_BATCH 128 + + static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, +@@ -765,12 +849,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + ++ atomic_long_add(nr_to_scan, &shrinker->objects_requested_to_free); ++ + shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; ++ + freed += ret; ++ atomic_long_add(ret, &shrinker->objects_freed); + + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; +diff --git a/net/9p/client.c b/net/9p/client.c +index 8bba0d9cf975..e14074d031c6 100644 +--- a/net/9p/client.c ++++ b/net/9p/client.c +@@ -218,23 +218,29 @@ static int parse_opts(char *opts, struct p9_client *clnt) + return ret; + } + +-static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, +- int alloc_msize) ++static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx, unsigned alloc_msize) + { +- if (likely(c->fcall_cache) && alloc_msize == c->msize) { +- fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); +- fc->cache = c->fcall_cache; +- } else { +- fc->sdata = kmalloc(alloc_msize, GFP_NOFS); +- fc->cache = NULL; +- } +- if (!fc->sdata) +- return -ENOMEM; ++ gfp_t gfp = GFP_NOFS|__GFP_NOWARN; ++ ++ BUG_ON(alloc_msize > c->msize); ++ ++ fc->sdata = NULL; ++ fc->used_mempool = false; + fc->capacity = alloc_msize; +- return 0; ++ ++ if (alloc_msize < c->msize) ++ fc->sdata = kmalloc(alloc_msize, gfp); ++ ++ if (!fc->sdata) { ++ fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp); ++ fc->used_mempool = true; ++ fc->capacity = c->msize; ++ } + } + +-void p9_fcall_fini(struct p9_fcall *fc) ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx) + { + /* sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us +@@ -242,8 +248,8 @@ void p9_fcall_fini(struct p9_fcall *fc) + if (unlikely(!fc->sdata)) + return; + +- if (fc->cache) +- kmem_cache_free(fc->cache, fc->sdata); ++ if (fc->used_mempool) ++ mempool_free(fc->sdata, &c->pools[fc_idx]); + else + kfree(fc->sdata); + } +@@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client 
*c, int8_t type, unsigned int max_size) + if (!req) + return ERR_PTR(-ENOMEM); + +- if (p9_fcall_init(c, &req->tc, alloc_msize)) +- goto free_req; +- if (p9_fcall_init(c, &req->rc, alloc_msize)) +- goto free; ++ p9_fcall_init(c, &req->tc, 0, alloc_msize); ++ p9_fcall_init(c, &req->rc, 1, alloc_msize); + + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); +@@ -305,14 +309,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ +- refcount_set(&req->refcount.refcount, 2); ++ refcount_set(&req->refcount, 2); + + return req; + + free: +- p9_fcall_fini(&req->tc); +- p9_fcall_fini(&req->rc); +-free_req: ++ p9_fcall_fini(c, &req->tc, 0); ++ p9_fcall_fini(c, &req->rc, 1); + kmem_cache_free(p9_req_cache, req); + return ERR_PTR(-ENOMEM); + } +@@ -341,7 +344,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { +- p9_req_put(req); ++ p9_req_put(c, req); + goto again; + } + } +@@ -367,21 +370,18 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); +- return p9_req_put(r); +-} +- +-static void p9_req_free(struct kref *ref) +-{ +- struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); +- +- p9_fcall_fini(&r->tc); +- p9_fcall_fini(&r->rc); +- kmem_cache_free(p9_req_cache, r); ++ return p9_req_put(c, r); + } + +-int p9_req_put(struct p9_req_t *r) ++int p9_req_put(struct p9_client *c, struct p9_req_t *r) + { +- return kref_put(&r->refcount, p9_req_free); ++ if (refcount_dec_and_test(&r->refcount)) { ++ p9_fcall_fini(c, &r->tc, 0); ++ p9_fcall_fini(c, &r->rc, 1); ++ kmem_cache_free(p9_req_cache, r); ++ return 1; ++ } ++ return 0; + } + EXPORT_SYMBOL(p9_req_put); + +@@ -426,7 +426,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) + + wake_up(&req->wq); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); +- p9_req_put(req); ++ p9_req_put(c, req); + } + EXPORT_SYMBOL(p9_client_cb); + +@@ -709,7 +709,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + reterr: + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ +- p9_req_put(req); ++ p9_req_put(c, req); + return ERR_PTR(err); + } + +@@ -746,7 +746,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
+ err = c->trans_mod->request(c, req); + if (err < 0) { + /* write won't happen */ +- p9_req_put(req); ++ p9_req_put(c, req); + if (err != -ERESTARTSYS && err != -EFAULT) + c->status = Disconnected; + goto recalc_sigpending; +@@ -1002,7 +1002,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + char *client_id; + + err = 0; +- clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); ++ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + return ERR_PTR(-ENOMEM); + +@@ -1053,10 +1053,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + goto close_trans; + } + +- err = p9_client_version(clnt); +- if (err) +- goto close_trans; +- + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ +@@ -1066,6 +1062,15 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + clnt->msize - (P9_HDRSZ + 4), + NULL); + ++ err = mempool_init_slab_pool(&clnt->pools[0], 4, clnt->fcall_cache) ?: ++ mempool_init_slab_pool(&clnt->pools[1], 4, clnt->fcall_cache); ++ if (err) ++ goto close_trans; ++ ++ err = p9_client_version(clnt); ++ if (err) ++ goto close_trans; ++ + return clnt; + + close_trans: +@@ -1073,6 +1078,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + put_trans: + v9fs_put_trans(clnt->trans_mod); + free_client: ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kfree(clnt); + return ERR_PTR(err); + } +@@ -1097,6 +1104,8 @@ void p9_client_destroy(struct p9_client *clnt) + + p9_tag_cleanup(clnt); + ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kmem_cache_destroy(clnt->fcall_cache); + kfree(clnt); + } +diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c +index 8f8f95e39b03..007c3f45fe05 100644 +--- a/net/9p/trans_fd.c ++++ b/net/9p/trans_fd.c +@@ -378,7 +378,7 @@ static void p9_read_work(struct work_struct *work) + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + +@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work) + m->wpos += err; + if (m->wpos == m->wsize) { + m->wpos = m->wsize = 0; +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +@@ -695,7 +695,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) + if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; +- p9_req_put(req); ++ p9_req_put(client, req); + ret = 0; + } + spin_unlock(&client->lock); +@@ -722,7 +722,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; + spin_unlock(&client->lock); +- p9_req_put(req); ++ p9_req_put(client, req); + + return 0; + } +@@ -883,12 +883,12 @@ static void p9_conn_destroy(struct p9_conn *m) + p9_mux_poll_stop(m); + cancel_work_sync(&m->rq); + if (m->rreq) { +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + cancel_work_sync(&m->wq); + if (m->wreq) { +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c +index 88e563826674..99d878d70d56 100644 +--- a/net/9p/trans_rdma.c ++++ b/net/9p/trans_rdma.c +@@ -350,7 +350,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) + c->busa, c->req->tc.size, + DMA_TO_DEVICE); + up(&rdma->sq_sem); +- p9_req_put(c->req); ++ p9_req_put(client, c->req); + kfree(c); + } + 
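/*
 * Editorial note (not part of the patch): the p9_fcall_init()/p9_fcall_fini()
 * hunks above move 9p message buffers from "kmalloc or fail" to "kmalloc,
 * then fall back to a per-client mempool", so a request can always make
 * forward progress under memory pressure. A minimal sketch of that pattern
 * with hypothetical names (struct msg_buf, buf_alloc, buf_free); only
 * kmalloc/kfree and mempool_alloc/mempool_free are real kernel APIs. The
 * pool's elements are assumed to be full-size (max message) buffers, as in
 * the patch:
 */
struct msg_buf {
	void	*data;
	bool	used_mempool;
};

static void buf_alloc(struct msg_buf *b, size_t size, mempool_t *pool)
{
	gfp_t gfp = GFP_NOFS | __GFP_NOWARN;

	b->used_mempool = false;
	b->data = kmalloc(size, gfp);		/* opportunistic allocation */

	if (!b->data) {				/* fall back to the reserved pool */
		b->data = mempool_alloc(pool, gfp);
		b->used_mempool = true;
	}
}

static void buf_free(struct msg_buf *b, mempool_t *pool)
{
	if (b->used_mempool)
		mempool_free(b->data, pool);
	else
		kfree(b->data);
}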
+@@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) + if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { + if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { + /* Got one! */ +- p9_fcall_fini(&req->rc); ++ p9_fcall_fini(client, &req->rc, 1); + req->rc.sdata = NULL; + goto dont_need_post_recv; + } else { +diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c +index b24a4fb0f0a2..147972bf2e79 100644 +--- a/net/9p/trans_virtio.c ++++ b/net/9p/trans_virtio.c +@@ -199,7 +199,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) + /* Reply won't come, so drop req ref */ + static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) + { +- p9_req_put(req); ++ p9_req_put(client, req); + return 0; + } + +@@ -523,7 +523,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + kvfree(out_pages); + if (!kicked) { + /* reply won't come */ +- p9_req_put(req); ++ p9_req_put(client, req); + } + return err; + } +diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c +index 833cd3792c51..227f89cc7237 100644 +--- a/net/9p/trans_xen.c ++++ b/net/9p/trans_xen.c +@@ -163,7 +163,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); +- p9_req_put(p9_req); ++ p9_req_put(client, p9_req); + + return 0; + } +diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c +index 4d1a947367f9..a2097955dace 100644 +--- a/tools/testing/nvdimm/test/ndtest.c ++++ b/tools/testing/nvdimm/test/ndtest.c +@@ -12,7 +12,7 @@ + #include + #include + #include +-#include ++#include + + #include "../watermark.h" + #include "nfit_test.h" +@@ -740,32 +740,30 @@ static ssize_t flags_show(struct device *dev, + { + struct nvdimm *nvdimm = to_nvdimm(dev); + struct ndtest_dimm *dimm = nvdimm_provider_data(nvdimm); +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + u64 flags; + + flags = dimm->flags; + +- seq_buf_init(&s, buf, PAGE_SIZE); + if (flags & PAPR_PMEM_UNARMED_MASK) +- seq_buf_printf(&s, "not_armed "); ++ prt_printf(&s, "not_armed "); + + if (flags & PAPR_PMEM_BAD_SHUTDOWN_MASK) +- seq_buf_printf(&s, "flush_fail "); ++ prt_printf(&s, "flush_fail "); + + if (flags & PAPR_PMEM_BAD_RESTORE_MASK) +- seq_buf_printf(&s, "restore_fail "); ++ prt_printf(&s, "restore_fail "); + + if (flags & PAPR_PMEM_SAVE_MASK) +- seq_buf_printf(&s, "save_fail "); ++ prt_printf(&s, "save_fail "); + + if (flags & PAPR_PMEM_SMART_EVENT_MASK) +- seq_buf_printf(&s, "smart_notify "); ++ prt_printf(&s, "smart_notify "); + ++ if (printbuf_written(&s)) ++ prt_printf(&s, "\n"); + +- if (seq_buf_used(&s)) +- seq_buf_printf(&s, "\n"); +- +- return seq_buf_used(&s); ++ return printbuf_written(&s); + } + static DEVICE_ATTR_RO(flags); + +-- +2.37.1 +
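/*
 * Editorial note (not part of the patch): the dump_unreclaimable_slab() and
 * shrinkers_to_text() hunks above share one bookkeeping pattern: keep a
 * fixed-size table of the N largest entries, sorted ascending, then report
 * it largest-first. A standalone sketch of that pattern in plain C; the
 * entry/top_add names and sample data are hypothetical, only the top-N
 * insertion logic mirrors the patch:
 */
#include <stdio.h>
#include <string.h>

#define NR_TOP 10
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct entry {
	const char	*name;
	unsigned long	size;
};

static struct entry top[NR_TOP];
static int nr;

static void top_add(struct entry n)
{
	int i;

	/* find insertion point: the table is kept sorted, smallest first */
	for (i = 0; i < nr; i++)
		if (n.size < top[i].size)
			break;

	if (nr < (int) ARRAY_SIZE(top)) {
		/* room left: shift the larger entries up */
		memmove(&top[i + 1], &top[i], sizeof(top[0]) * (nr - i));
		nr++;
	} else if (i) {
		/* table full: drop the current smallest entry */
		i--;
		memmove(&top[0], &top[1], sizeof(top[0]) * i);
	} else {
		return;		/* smaller than everything already tracked */
	}

	top[i] = n;
}

int main(void)
{
	struct entry samples[] = {
		{ "dentry",	 4096 }, { "inode_cache", 8192 },
		{ "kmalloc-64",	  512 }, { "buffer_head", 2048 },
	};
	int i;

	for (i = 0; i < (int) ARRAY_SIZE(samples); i++)
		top_add(samples[i]);

	for (i = nr - 1; i >= 0; i--)	/* largest first, as in the patch */
		printf("%-17s %lu\n", top[i].name, top[i].size);

	return 0;
}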